#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
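
# BQ_DIR is the root of the local archive mirror; index.html snapshots are read
# from its archive/ and qstarchive/ subdirectories (see the __main__ block).
# QM_NAMES lists the names the quest master posted under; they drive the
# tagging heuristics in scrape_posts().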
BQ_DIR = '/var/www/html/banished/'
QM_NAMES = ['Soma', 'Somas']
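
# config.db is assumed to be a dict of psycopg2 connection parameters
# (dbname, user, password, ...), unpacked straight into connect()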
con = psycopg2.connect(**config.db)
cur = con.cursor()


def init_db():
    """Initializes the database if it hasn't been already."""
    with open('voyage.sql', 'r') as file:
        cur.execute(file.read())


def scrape_posts(root_dir):
    """Walks `root_dir` and extracts data from every index.html file found."""
    for dir_name, sub_dirs, files in os.walk(root_dir):
        for file in files:
            if file != 'index.html':
                continue
            # dir_name from os.walk already includes root_dir
            filepath = os.path.join(dir_name, file)
            print("Processing:", filepath)
            with open(filepath, 'r') as html_file:
                data = html_file.read()
            # drop soft line-break hints so post bodies come out as continuous text
            data = re.sub(r'<wbr ?/?>', '', data)
            soup = bs4.BeautifulSoup(data, 'html.parser')
            # remove the mobile-layout duplicates of the post information
            # before extracting anything
            mobiles = soup.find_all(class_='mobile')
            for tag in mobiles:
                tag.decompose()
            # thread metadata comes from the OP; the thread element's id looks
            # like "t12345678", so strip the leading "t"
            thread_id = int(soup.find(class_='thread').get('id')[1:])
            thread_title = soup.find(class_='subject').text
            thread_time = int(soup.find(class_='dateTime').get('data-utc'))
            thread_time = datetime.datetime.utcfromtimestamp(thread_time)
            thread_time = thread_time.replace(tzinfo=datetime.timezone.utc)
            cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
                        (thread_id, thread_title, thread_time))
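            # each reply, the OP included, sits in its own postContainer element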
            posts = soup.find_all(class_='postContainer')
            for post in posts:
                # information gathering
                post_id = int(post.get('id')[2:])  # container ids look like "pc12345678"
                subject = post.find(class_='subject')
                if subject:
                    subject = subject.text
                name = post.find(class_='name').text
                user_email = post.find(class_='useremail')
                if user_email:
                    user_email = user_email.get('href')
                tripcode = post.find(class_='postertrip')
                if tripcode:
                    tripcode = tripcode.text
                user_id = post.find(class_='hand')
                if user_id:
                    user_id = user_id.text
                post_time = int(post.find(class_='dateTime').get('data-utc'))
                post_time = datetime.datetime.utcfromtimestamp(post_time)
                post_time = post_time.replace(tzinfo=datetime.timezone.utc)
                chan_file_name = None
                original_file_name = None
                file_text = post.find(class_='fileText')
                if file_text:
                    chan_file_name = file_text.a.get('href').rpartition('/')[2]
                    original_file_name = file_text.a.text
                post_body = post.find(class_='postMessage').get_text('\n')
                # keep only in-thread quote links (href starts with "#"), strip
                # the ">>" prefix from their text, and deduplicate
                links = post.find_all(class_='quotelink')
                links = [l for l in links if l.get('href').startswith('#')]
                links = [int(link.text[2:]) for link in links]
                links = list(set(links))
                # heuristics
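                # a QM-named post is assumed to be story content by default; the
                # checks below strip 'story_post' again when the post turns out to
                # be a tally, dice call, or OP boilerplate instead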
                tags = set()
                if name in QM_NAMES:
                    tags.add('qm_post')
                    # also counts shitposters and broken tripcodes
                    tags.add('story_post')
                    # assume every QM post is also a story post, until
                    # proven otherwise
                if "dropped my trip" in post_body.lower():
                    for link in links:
                        cur.execute("INSERT INTO tag "
                                    "VALUES (%s,%s), (%s,%s), (%s,%s)",
                                    (link, 'qm_post',
                                     link, 'story_post',
                                     link, 'dropped_trip'
                                     )
                                    )
                    # dropped trip doesn't necessarily mean story_post
                    tags.add('dropped_trip')
                if len(links) > 1:
                    tags.add('tally_post')
                    # also counts Q&A posts
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if 'writin' in post_body.lower():
                    tags.add('tally_post')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if posts.index(post) == 0:
                    tags.add('op_post')
                if "Welcome to Banished Quest!" in post_body:
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if re.search(r'ro+l+ me', post_body.lower()):
                    tags.add('dice_call')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if re.search(r'roll .*3d10', post_body.lower()):
                    tags.add('dice_call')
                if 'final destination' in post_body.lower():
                    tags.add('final_destination')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if 'story_post' in tags:
                    if len(re.findall(r'\n>', post_body)) > 1:
                        tags.add('vote_choices')
# database insert
cur.execute(
"INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(thread_id, post_id, subject, name, user_email, tripcode,
user_id, post_time, post_body)
)
for link in links:
cur.execute("INSERT INTO link VALUES (%s,%s)",
(post_id, link)
)
for tag in tags:
cur.execute("INSERT INTO tag VALUES (%s,%s)",
(post_id, tag)
)
if file_text:
cur.execute("INSERT INTO file VALUES (%s,%s,%s)",
(post_id, chan_file_name, original_file_name)
)
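

# everything runs on the single module-level connection; nothing is committed
# until both archive directories have been scraped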
if __name__ == '__main__':
    init_db()
    scrape_posts(os.path.join(BQ_DIR, 'archive'))
    scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
    con.commit()
    con.close()