sopel/modules/url.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
URL parsing.
"""
import re
from html import unescape
from urllib.parse import urlparse

import requests

from module import rule
from tools import web
from config.types import ValidatedAttribute, ListAttribute, StaticSection
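
# Request only the first 4 KiB of each page; the <title> element normally
# appears near the top of the document, so full downloads are unnecessary.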
headers = {"User-Agent": "bix nood gimme the title", "Range": "bytes=0-4096"}


class UrlSection(StaticSection):
    # TODO some validation rules maybe?
    # Regular expressions for URLs that should never be titled.
    exclude = ListAttribute('exclude')
    # Character which, when prefixed to a URL, suppresses titling.
    exclusion_char = ValidatedAttribute('exclusion_char', default='!')


def configure(config):
    config.define_section('url', UrlSection)
    config.url.configure_setting(
        'exclude',
        'Enter regular expressions for each URL you would like to exclude.'
    )
    config.url.configure_setting(
        'exclusion_char',
        'Enter a character which can be prefixed to suppress URL titling'
    )
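
# A hypothetical example of the resulting [url] block in the bot's config
# file (assuming the usual INI-style configuration backing StaticSection;
# the values below are illustrative only):
#
#   [url]
#   exclude = ^https?://intranet\.example\.org/
#   exclusion_char = !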


@rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
    """
    Automatically show titles for URLs. For shortened URLs/redirects, find
    where the URL redirects to and show the title for that.
    """
    # Matches bare URLs as well as URLs prefixed with the exclusion character.
    url_finder = re.compile(r'(?u)(%s?(?:http|https|ftp)(?:://\S+))'
                            % bot.config.url.exclusion_char, re.IGNORECASE)
    # Don't auto-title messages that are an explicit title command.
    if re.match(bot.config.core.prefix + 'title', trigger):
        return
    urls = re.findall(url_finder, trigger)
    if len(urls) == 0:
        return
    for url in urls:
        # Respect the exclusion character: a URL prefixed with it is
        # deliberately left untitled.
        if url.startswith(bot.config.url.exclusion_char):
            continue
        # Skip URLs matching any configured exclusion pattern.
        if any(re.search(pattern, url) for pattern in bot.config.url.exclude or []):
            continue
        # Avoid fetching known malicious links
        if not web.secCheck(bot, url):
            continue
        try:
            res = requests.get(url, headers=headers, verify=True)
        except requests.exceptions.ConnectionError:
            continue
        if res.status_code == 404:
            continue
        # Other HTTP error statuses are allowed to raise.
        res.raise_for_status()
        if not res.headers.get("Content-Type", "").startswith("text/html"):
            continue
        if res.text.find("<title>") == -1:
            continue
        # Naive extraction of the contents of the first <title> element.
        title = res.text[res.text.find("<title>") + 7:res.text.find("</title>")]
        title = unescape(title)
        title = title.replace("\n", "").strip()
        hostname = urlparse(url).hostname
        # \x03NN are IRC colour codes: teal for the title, red for the hostname.
        bot.say('[ \x0310%s \x03] - \x0304%s' % (title, hostname))
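
# Illustrative channel interaction (example URL and title only; IRC colour
# codes omitted):
#   <user> have a look at https://www.example.com/some-page
#   <bot>  [ Some Page Title ] - www.example.com
# Prefixing a URL with the exclusion character (default '!') suppresses the
# lookup:
#   <user> !https://www.example.com/some-page
#   (no reply)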