sopel/modules/url.py

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
URL parsing.
"""
import html
import re
from html.parser import HTMLParser
from urllib.parse import urlparse

import requests

from config.types import ValidatedAttribute, ListAttribute, StaticSection
from module import rule
from tools import web

# HTTP headers sent with every title lookup. The Range header asks the server
# for only the first 4097 bytes (offsets 0-4096) of the page.
headers = {
	"User-Agent": "bix nood gimme the title",
	"Range": "bytes=0-4096",
}

class UrlSection(StaticSection):
	"""Settings declared for the ``url`` configuration section."""

	# TODO: consider adding validation rules for these settings.

	# List-valued setting named 'exclude'.
	exclude = ListAttribute(
		'exclude',
	)

	# Single-valued setting named 'exclusion_char'; falls back to '!'.
	exclusion_char = ValidatedAttribute(
		'exclusion_char',
		default='!',
	)


def configure(config):
	"""
	Define the ``url`` section on ``config`` and configure its settings.

	Each setting name is handed to ``config.url.configure_setting`` together
	with the prompt text describing it, in the order listed below.
	"""
	config.define_section('url', UrlSection)

	# (setting name, prompt text) pairs, processed in order.
	setting_prompts = (
		(
			'exclude',
			'Enter regular expressions for each URL you would like to exclude.',
		),
		(
			'exclusion_char',
			'Enter a character which can be prefixed to suppress URL titling',
		),
	)
	for setting_name, prompt_text in setting_prompts:
		config.url.configure_setting(setting_name, prompt_text)


# Seconds to wait on a server before giving up, so that a stalled host cannot
# block the handler indefinitely.
_FETCH_TIMEOUT = 10

# Contents of the first <title> element. Tolerates attributes on the tag,
# upper-case tag names and titles that span several lines. A document that was
# cut off before </title> simply does not match.
_TITLE_RE = re.compile(
	r'<title\b[^>]*>(.*?)</title\s*>', re.IGNORECASE | re.DOTALL)


def _is_excluded(bot, url):
	"""
	Return True if ``url`` matches any pattern in the ``exclude`` setting.

	Patterns are the regular expressions stored in ``bot.config.url.exclude``.
	A malformed pattern is ignored instead of aborting the lookup.
	"""
	for pattern in bot.config.url.exclude or ():
		try:
			if re.search(pattern, url):
				return True
		except re.error:
			continue
	return False


def _fetch_title(url):
	"""
	Fetch ``url`` and return its cleaned-up HTML title.

	Returns None when the request fails, the server answers with an error
	status, the response is not ``text/html``, or no complete, non-empty
	``<title>`` element is found.
	"""
	try:
		# stream=True defers the body download until the headers were checked.
		res = requests.get(url, headers=headers, verify=True, stream=True,
						   timeout=_FETCH_TIMEOUT)
	except requests.exceptions.RequestException:
		# Covers connection errors, timeouts, redirect loops and schemes that
		# requests cannot fetch (e.g. ftp://).
		return None

	try:
		if not res.ok:
			return None
		content_type = res.headers.get("Content-Type", "")
		if not content_type.lower().startswith("text/html"):
			return None
		text = res.text
	except requests.exceptions.RequestException:
		# The body is read lazily, so network errors can still surface here.
		return None
	finally:
		res.close()

	found = _TITLE_RE.search(text)
	if found is None:
		return None
	# Collapse every whitespace run (including CR/LF) into a single space so
	# the title is always one clean line.
	title = ' '.join(html.unescape(found.group(1)).split())
	return title or None


@rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
	"""
	Automatically show titles for URLs. For shortened URLs/redirects, find
	where the URL redirects to and show the title for that.

	A URL is skipped when it is prefixed with the configured exclusion
	character, matches one of the configured ``exclude`` patterns, fails
	``web.secCheck``, or yields no title. A failure on one URL never prevents
	the remaining URLs of the same message from being handled.

	:param bot: bot instance; provides ``config`` and ``say``.
	:param trigger: the message text that matched the rule.
	"""
	# Leave explicit "<prefix>title" commands alone.
	if re.match(bot.config.core.prefix + 'title', trigger):
		return

	exclusion_char = bot.config.url.exclusion_char
	# Escape the configured prefix so regex metacharacters are taken literally;
	# with no prefix configured, nothing is prepended to the pattern.
	if exclusion_char:
		prefix_pattern = '(?:%s)?' % re.escape(exclusion_char)
	else:
		prefix_pattern = ''
	url_finder = re.compile(
		r'(?u)(%s(?:http|https|ftp)(?:://\S+))' % prefix_pattern,
		re.IGNORECASE)

	for url in url_finder.findall(trigger):
		# The user explicitly asked for this URL not to be titled.
		if exclusion_char and url.startswith(exclusion_char):
			continue
		if _is_excluded(bot, url):
			continue
		# Avoid fetching known malicious links
		if not web.secCheck(bot, url):
			continue

		title = _fetch_title(url)
		if not title:
			continue

		hostname = urlparse(url).hostname
		bot.say('[ \x0310%s \x03] - \x0304%s' % (title, hostname))