From 92aa5bb2329c39cf97d4399839989e7401820ae4 Mon Sep 17 00:00:00 2001
From: lassulus
Date: Fri, 6 Jul 2018 17:42:04 +0200
Subject: Reaktor url-title: fix some issues with weird urls

ref: https://irc-bot-science.clsr.net/
---
 krebs/5pkgs/simple/Reaktor/plugins.nix | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/krebs/5pkgs/simple/Reaktor/plugins.nix b/krebs/5pkgs/simple/Reaktor/plugins.nix
index cd389366e..4a7917b68 100644
--- a/krebs/5pkgs/simple/Reaktor/plugins.nix
+++ b/krebs/5pkgs/simple/Reaktor/plugins.nix
@@ -121,21 +121,27 @@ rec {
     pattern = "^.*(?Phttp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+).*$$";
     path = with pkgs; [ curl perl ];
     script = pkgs.writePython3 "url-title" [ "beautifulsoup4" "lxml" ] ''
+      import cgi
       import sys
       import urllib.request
       from bs4 import BeautifulSoup
 
       try:
-          soup = BeautifulSoup(urllib.request.urlopen(sys.argv[1]), "lxml")
-          title = soup.find('title').string
+          resp = urllib.request.urlopen(sys.argv[1])
+          if resp.headers['content-type'].find('text/html') >= 0:
+              soup = BeautifulSoup(resp.read(16000), "lxml")
+              title = soup.find('title').string
 
-          if title:
-              if len(title) > 512:
-                  print('message to long, skipped')
-              elif len(title.split('\n')) > 5:
-                  print('to many lines, skipped')
-              else:
-                  print(title)
+              if title:
+                  if len(title) > 450:
+                      print('message to long, rest skipped')
+                  elif len(title.split('\n')) > 5:
+                      print('to many lines, skipped')
+                  else:
+                      print(title)
+          else:
+              cd_header = resp.headers['content-disposition']
+              print(cgi.parse_header(cd_header)[1]['filename'])
       except:  # noqa: E722
           pass
     '';
-- 
cgit v1.2.3