Voyage/scrape_quest.py

138 lines
3.9 KiB
Python
Raw Normal View History

2019-12-06 18:13:43 -05:00
#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
# Root of the locally-mirrored quest archive; scraped thread pages live in
# subdirectories under here (see the __main__ block).
BQ_DIR = '/var/www/html/banished/'
# Poster names treated as the questmaster by the tagging heuristics.
# NOTE(review): matched by name only, so impersonators also match — the
# heuristics below acknowledge this.
QM_NAMES = ['Soma', 'Somas']
# Module-level connection/cursor shared by every function in this script;
# committed and closed once at the end of __main__.
con = psycopg2.connect(**config.db)
cur = con.cursor()
def init_db():
    """Initializes the database if it hasn't been already.

    The schema lives in voyage.sql; it is executed verbatim through the
    module-level cursor.
    """
    with open('voyage.sql', 'r') as schema:
        ddl = schema.read()
    cur.execute(ddl)
def scrape_posts(root_dir):
    """Walks `root_dir` and extracts data from index.html files found.

    Every thread page is parsed with BeautifulSoup and inserted through the
    module-level cursor: one `thread` row, one `post` row per post, plus
    `link`, `tag` and `file` rows.  Nothing is committed here; the caller
    owns the transaction.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for filename in files:
            if filename != 'index.html':
                continue
            # os.walk already yields the full directory path in dir_name, so
            # joining root_dir in again was redundant (and wrong whenever
            # root_dir is a relative path).
            filepath = os.path.join(dir_name, filename)
            print("Processing:", filepath)
            with open(filepath, 'r') as page:
                soup = bs4.BeautifulSoup(page.read(), 'html.parser')
            # The markup duplicates post data in .mobile nodes; drop them so
            # the find() calls below only ever hit the desktop copy.
            for node in soup.find_all(class_='mobile'):
                node.decompose()
            _scrape_thread(soup)


def _utc_time(timestamp):
    """Converts a unix timestamp (int) into an aware UTC datetime."""
    return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)


def _scrape_thread(soup):
    """Inserts the thread row for one parsed page, then each of its posts."""
    thread_id = int(soup.find(class_='thread').get('id')[1:])  # id="t12345"
    thread_title = soup.find(class_='subject').text
    thread_time = _utc_time(int(soup.find(class_='dateTime').get('data-utc')))
    cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
                (thread_id, thread_title, thread_time)
                )
    posts = soup.find_all(class_='postContainer')
    # enumerate instead of posts.index(post): same result, O(1) per post.
    for index, post in enumerate(posts):
        _scrape_post(thread_id, post, is_op=(index == 0))


def _scrape_post(thread_id, post, is_op):
    """Extracts one post's fields, applies tag heuristics, inserts its rows."""
    # information gathering
    post_id = int(post.get('id')[2:])  # container id="pc12345"
    name = post.find(class_='name').text
    tripcode = post.find(class_='postertrip')
    if tripcode:
        tripcode = tripcode.text
    subject = post.find(class_='subject')
    if subject:
        subject = subject.text
    post_time = _utc_time(int(post.find(class_='dateTime').get('data-utc')))
    file_url = None
    file_name = None
    file_md5 = None
    file_text = post.find(class_='fileText')
    if file_text:
        file_url = file_text.a.get('href')
        file_name = file_text.a.text
        file_md5 = post.find(class_='fileThumb').img.get('data-md5')
    post_body = post.find(class_='postMessage').get_text('\n')
    links = post.find_all(class_='quotelink')
    # Keep only same-thread quotes (href="#p12345"); link text is ">>12345",
    # so [2:] strips the ">>" prefix.  De-duplicate repeated quotes.
    links = [l for l in links if l.get('href').startswith('#')]
    links = list({int(link.text[2:]) for link in links})

    # heuristics
    tags = []
    if name in QM_NAMES:
        # also counts shitposters and broken tripcodes
        tags.append('qm_post')
        # assume every QM post is also a story post, until proven otherwise
        tags.append('story_post')
    if "dropped my trip" in post_body.lower():
        # Retroactively tag the posts this one quotes as QM posts.
        for link in links:
            cur.execute("INSERT INTO tag "
                        "VALUES (%s,%s), (%s,%s), (%s,%s)",
                        (link, 'qm_post',
                         link, 'story_post',
                         link, 'dropped_trip'
                         )
                        )
        # dropped trip doesn't necessarily mean story_post
        tags.append('dropped_trip')
    if len(links) > 1:
        # also counts Q&A posts
        tags.append('vote_tally_post')
        try:
            tags.remove('story_post')
        except ValueError:
            pass
    if is_op:
        tags.append('op_post')
    if "Welcome to Banished Quest!" in post_body:
        try:
            tags.remove('story_post')
        except ValueError:
            pass

    # database inserts
    cur.execute(
        "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)",
        (thread_id, post_id, name, tripcode, subject,
         post_time, post_body)
    )
    for link in links:
        cur.execute("INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                    )
    for tag in tags:
        cur.execute("INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                    )
    if file_text:
        cur.execute("INSERT INTO file VALUES (%s,%s,%s)",
                    (file_url, file_name, file_md5)
                    )
if __name__ == '__main__':
    # Create the schema, scrape both archive trees, then commit everything
    # in a single transaction.
    init_db()
    for subdir in ('archive', 'qstarchive'):
        scrape_posts(os.path.join(BQ_DIR, subdir))
    con.commit()
    con.close()