fulvia/modules/url.py

#!/usr/bin/env python3
"""
URL parsing.
"""
import re
import html
from urllib.parse import urlparse

import requests

from module import hook, url_callback

# Headers sent with the generic page fetch in title_auto. The Range header
# asks the server for only bytes 0-4096 of the page, which is normally enough
# to reach the <title> tag; servers that ignore Range return the full body.
HEADERS = {"User-Agent": "Give me your data.", "Range": "bytes=0-4096"}

@url_callback('puu.sh/')
def get_puush_fname(bot, url):
	"""
	Callback puu.sh links. Gets the filename and displays it.

	Sends a HEAD request to the link and reads the quoted filename out of the
	Content-Disposition response header. Nothing is sent to the channel when
	the request fails or the header carries no quoted filename.

	bot -- object whose msg() method is used to send the result
	url -- the puu.sh link found in the message
	"""
	try:
		# A timeout keeps an unresponsive host from stalling the bot.
		res = requests.head(url, timeout=10)
	except requests.exceptions.RequestException:
		return

	disposition = res.headers.get('Content-Disposition', '')
	match = re.search(r'\"(.*)\"', disposition)
	# No header, or a header without a quoted filename: nothing to report.
	if match is None or not match.group(1):
		return
	fname = match.group(1)

	bot.msg(f"[ \x0310puu.sh \x03] - \x0304{fname}")


@url_callback("youtube.com/watch")
@url_callback("youtu.be/")
def youtube_title(bot, url):
	"""
	Retrieve the title of the YouTube video and display it.

	The title is looked up through YouTube's oEmbed endpoint. Nothing is sent
	to the channel when the request fails, the server answers with an HTTP
	error, or the response is not JSON containing a 'title' key.

	bot -- object whose msg() method is used to send the result
	url -- the YouTube link found in the message
	"""
	oembed_url = "https://www.youtube.com/oembed"
	try:
		# Passing the link through params lets requests percent-encode it, so
		# a link carrying its own '&' parameters is not split apart in the
		# oEmbed query string.
		res = requests.get(oembed_url, params={"url": url}, timeout=10)
		res.raise_for_status()
		title = res.json()['title']
	except (requests.exceptions.RequestException, ValueError, KeyError):
		# ValueError covers a body that is not valid JSON.
		return

	# The host shown is that of the oEmbed endpoint, which is what this
	# callback has always displayed regardless of the link's own host.
	hostname = urlparse(oembed_url).hostname

	bot.msg(f"[ \x0310{title} \x03] - \x0304{hostname}")


# Matches http:// and https:// links (any letter case) up to the next
# whitespace character.
_URL_FINDER = re.compile(r"((?:http|https)(?::\/\/\S+))", re.IGNORECASE)

# Captures the text between an opening <title ...> tag and its closing tag.
# Case-insensitive, and DOTALL so titles spanning several lines still match.
_TITLE_FINDER = re.compile(
	r"<title\b[^>]*>(.*?)</title\s*>", re.IGNORECASE | re.DOTALL)


def _extract_title(page):
	"""
	Return the cleaned-up <title> text of an HTML page, or '' if there is
	no complete title element in the text.
	"""
	match = _TITLE_FINDER.search(page)
	if match is None:
		return ''
	# Unescape before collapsing whitespace so that line breaks hidden in
	# character references (e.g. &#13;) are removed as well; the result is a
	# single line that is safe to send as one message.
	return ' '.join(html.unescape(match.group(1)).split())


@hook(True)
def title_auto(bot, trigger):
	"""
	Automatically show titles for URLs. For shortened URLs/redirects, find
	where the URL redirects to and show the title for that.

	Every http(s) link found in trigger.args is processed in order:
	twitter.com links are rewritten to nitter.net, links matching a key in
	bot.url_callbacks are handed to every matching callback, and all other
	links are fetched and their HTML <title> is sent with bot.msg. Links
	that cannot be fetched, are not text/html, or have no usable title are
	skipped silently.

	bot     -- object providing url_callbacks and msg()
	trigger -- object whose args attribute holds the message text pieces
	"""
	urls = _URL_FINDER.findall(' '.join(trigger.args))

	for url in urls:
		url = url.replace('twitter.com', 'nitter.net')

		# Give every registered callback whose key occurs in the link a
		# chance to handle it; handled links skip the generic lookup.
		handled = False
		for key in bot.url_callbacks:
			if key in url:
				bot.url_callbacks[key](bot, url)
				handled = True
		if handled:
			continue

		try:
			res = requests.get(url, headers=HEADERS, verify=True, timeout=10)
			res.raise_for_status()
		except requests.exceptions.RequestException:
			# Covers connection problems, timeouts, malformed URLs,
			# redirect loops and HTTP error statuses alike.
			continue

		if not res.headers.get("Content-Type", "").startswith("text/html"):
			continue

		title = _extract_title(res.text)
		if not title:
			continue

		hostname = urlparse(url).hostname
		bot.msg(f"[ \x0310{title} \x03] - \x0304{hostname}")
initial commit 2018-03-16 03:13:43 -04:00			`#!/usr/bin/env python3`
			`"""`
			`URL parsing.`
			`"""`
			`import re`
fix title parsing library reference 2021-01-10 17:45:51 -05:00			`import html`
initial commit 2018-03-16 03:13:43 -04:00			`from urllib.parse import urlparse`

			`import requests`

added puu.sh filename support 2018-09-19 13:01:19 -04:00			`from module import hook, url_callback`
initial commit 2018-03-16 03:13:43 -04:00
remove naughty words 2020-01-08 07:51:00 -05:00			`HEADERS = {"User-Agent": "Give me your data.", "Range": "bytes=0-4096"}`
initial commit 2018-03-16 03:13:43 -04:00
added puu.sh filename support 2018-09-19 13:01:19 -04:00			`@url_callback('puu.sh/')`
			`def get_puush_fname(bot, url):`
			`"""`
			`Callback puu.sh links. Gets the filename and displays it.`
			`"""`
			`try:`
			`res = requests.head(url)`
			`except requests.exceptions.RequestException:`
			`return`

			`fname = res.headers.get('Content-Disposition', '')`
			`fname = re.search(r'\"(.*)\"', fname).group(1)`
			`if not fname:`
			`return`

			`bot.msg(f"[ \x0310puu.sh \x03] - \x0304{fname}")`


bugfix youtube url titles 2020-08-21 01:03:00 -04:00			`@url_callback("youtube.com/watch")`
fix youtube url matching error 2021-01-10 15:00:49 -05:00			`@url_callback("youtu.be/")`
merge youtube module into url module, add twitter/nitter support 2020-07-07 07:57:42 -04:00			`def youtube_title(bot, url):`
			`"""`
			`Retrieve the title of the YouTube video and display it.`
			`"""`
			`url = "https://www.youtube.com/oembed?url=" + url`
			`res = requests.get(url)`
			`res.raise_for_status()`

			`title = res.json()['title']`
			`hostname = urlparse(url).hostname`

			`bot.msg(f"[ \x0310{title} \x03] - \x0304{hostname}")`


initial commit 2018-03-16 03:13:43 -04:00			`@hook(True)`
			`def title_auto(bot, trigger):`
			`"""`
			`Automatically show titles for URLs. For shortened URLs/redirects, find`
			`where the URL redirects to and show the title for that.`
			`"""`
refactored trigger 2020-01-07 18:58:19 -05:00			`if "http" not in ' '.join(trigger.args):`
initial commit 2018-03-16 03:13:43 -04:00			`return`
			`url_finder = re.compile(r"((?:http\|https)(?::\/\/\S+))", re.IGNORECASE)`

refactored trigger 2020-01-07 18:58:19 -05:00			`urls = re.findall(url_finder, ' '.join(trigger.args))`
initial commit 2018-03-16 03:13:43 -04:00			`if len(urls) == 0:`
			`return`

			`for url in urls:`
merge youtube module into url module, add twitter/nitter support 2020-07-07 07:57:42 -04:00			`url = url.replace('twitter.com', 'nitter.net')`
initial commit 2018-03-16 03:13:43 -04:00			`broken = False`
			`for key in bot.url_callbacks:`
			`if key in url:`
			`bot.url_callbacks[key](bot, url)`
			`broken = True`
			`if broken:`
			`continue`
			`try:`
reordered calc.py's aliases, changed remind.py to wait on stop condition, added timeout to url.py 2018-06-11 12:49:36 -04:00			`res = requests.get(url, headers=HEADERS, verify=True, timeout=10)`
timeout exception for url titles 2018-07-05 06:58:12 -04:00			`except (requests.exceptions.ConnectionError,`
bugfix 2018-07-12 20:04:45 -04:00			`requests.exceptions.ReadTimeout):`
initial commit 2018-03-16 03:13:43 -04:00			`continue`
			`try:`
			`res.raise_for_status()`
			`except:`
			`continue`
modeChanged() works properly, small changes to module loading 2018-05-27 14:16:50 -04:00			`if not res.headers.get("Content-Type"):`
			`continue`
			`if not res.headers.get("Content-Type").startswith("text/html"):`
initial commit 2018-03-16 03:13:43 -04:00			`continue`
			`if res.text.find("<title>") == -1:`
			`continue`
			`title = res.text[res.text.find("<title>")+7:res.text.find("</title>")]`
fix title parsing library reference 2021-01-10 17:45:51 -05:00			`title = html.unescape(title)`
initial commit 2018-03-16 03:13:43 -04:00			`title = title.replace("\n","").strip()`
			`hostname = urlparse(url).hostname`
changed bot.say to bot.msg 2018-05-25 15:21:18 -04:00			`bot.msg(f"[ \x0310{title} \x03] - \x0304{hostname}")`