#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
URL parsing.

Looks for URLs in messages matched by the ``title_auto`` rule, fetches each
page and announces its HTML ``<title>`` together with the host name.
"""
import html
import re
from module import rule
from tools import web
from config.types import ValidatedAttribute, ListAttribute, StaticSection
import requests
from urllib.parse import urlparse
from html.parser import HTMLParser

# The Range header asks the server for only the first bytes of the page, which
# is normally enough to contain <title>; servers are free to ignore it.
headers = {"User-Agent": "bix nood gimme the title",
           "Range": "bytes=0-4096"}

# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host would block the handler indefinitely.
_REQUEST_TIMEOUT = 10

# Matches the first <title ...>...</title> element, case-insensitively and
# across line breaks.
_TITLE_RE = re.compile(r'<title(?:\s[^>]*)?>(.*?)</title>',
                       re.IGNORECASE | re.DOTALL)


class UrlSection(StaticSection):
    """Configuration section holding the URL-titling settings."""
    # TODO some validation rules maybe?
    # Regular expressions for URLs that should be excluded.
    # NOTE(review): nothing in the visible code consults this setting -- confirm
    # whether it is applied elsewhere.
    exclude = ListAttribute('exclude')
    # Prefix that suppresses titling for the URL it precedes.
    exclusion_char = ValidatedAttribute('exclusion_char', default='!')


def configure(config):
    """Define the ``url`` config section and prompt for its settings."""
    config.define_section('url', UrlSection)
    config.url.configure_setting(
        'exclude',
        'Enter regular expressions for each URL you would like to exclude.'
    )
    config.url.configure_setting(
        'exclusion_char',
        'Enter a character which can be prefixed to suppress URL titling'
    )


def _extract_title(text):
    """Return the cleaned-up contents of the first <title> in *text*.

    HTML entities are unescaped, line breaks removed and surrounding
    whitespace stripped. Returns ``None`` when there is no complete
    ``<title>...</title>`` element or when the title is empty.
    """
    match = _TITLE_RE.search(text)
    if match is None:
        return None
    title = html.unescape(match.group(1))
    # Line breaks inside the title must not leak into the one-line message.
    title = title.replace("\r", "").replace("\n", "").strip()
    return title or None


@rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
    """
    Automatically show titles for URLs. For shortened URLs/redirects, find
    where the URL redirects to and show the title for that.

    URLs prefixed with the configured exclusion character are skipped, as are
    URLs rejected by ``web.secCheck``, URLs that cannot be fetched, non-HTML
    responses and pages without a usable title. A failure on one URL never
    prevents the remaining URLs from being processed.
    """
    exclusion_char = bot.config.url.exclusion_char or ''
    # The exclusion character is user-supplied, so escape it before embedding
    # it in the pattern; the group keeps '?' applying to the whole prefix.
    url_finder = re.compile(
        r'(?u)((?:%s)?(?:http|https|ftp)(?:://\S+))' % re.escape(exclusion_char),
        re.IGNORECASE)

    # The explicit title command handles its own output.
    if re.match(bot.config.core.prefix + 'title', trigger):
        return

    for url in re.findall(url_finder, trigger):
        # The finder captures the exclusion prefix along with the URL; such
        # URLs were explicitly opted out of titling.
        if exclusion_char and url.startswith(exclusion_char):
            continue

        # Avoid fetching known malicious links
        if not web.secCheck(bot, url):
            continue

        try:
            res = requests.get(url, headers=headers, verify=True,
                               timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.exceptions.RequestException:
            # Covers connection errors, timeouts, unsupported schemes (the
            # finder also matches ftp://) and HTTP error statuses such as 404.
            continue

        if not res.headers.get("Content-Type", "").startswith("text/html"):
            continue

        title = _extract_title(res.text)
        if title is None:
            continue

        hostname = urlparse(url).hostname
        bot.say('[ \x0310%s \x03] - \x0304%s' % (title, hostname))