#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'
QM_NAMES = ['Soma', 'Somas']

con = psycopg2.connect(**config.db)
cur = con.cursor()


def init_db():
    """Initializes the database if it hasn't been already."""
    with open('voyage.sql', 'r') as schema:
        cur.execute(schema.read())


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    For every archived thread page this inserts one `thread` row, one
    `post` row per post, plus `link` rows (same-thread quotelinks) and
    heuristic `tag` rows.  Nothing is committed here; the caller owns the
    transaction.
    """
    for dir_name, _subdirs, files in os.walk(root_dir):
        for fname in files:
            if fname != 'index.html':
                continue
            # BUGFIX: os.walk() already yields `dir_name` rooted at
            # `root_dir`; the old `os.path.join(root_dir, dir_name, fname)`
            # only worked because an absolute `dir_name` resets join().
            filepath = os.path.join(dir_name, fname)
            print("Processing:", filepath)
            with open(filepath, 'r') as page:
                soup = bs4.BeautifulSoup(page.read(), 'html.parser')
            # The mobile markup duplicates every post; drop it so posts
            # aren't extracted twice.
            for tag in soup.find_all(class_='mobile'):
                tag.decompose()

            # Thread ids look like "t12345"; strip the leading letter.
            thread_id = int(soup.find(class_='thread').get('id')[1:])
            thread_title = soup.find(class_='subject').text
            # Archive timestamps are UTC epoch seconds; build an aware
            # datetime directly (utcfromtimestamp is deprecated and the
            # naive value needed a .replace(tzinfo=...) afterwards).
            thread_time = datetime.datetime.fromtimestamp(
                int(soup.find(class_='dateTime').get('data-utc')),
                tz=datetime.timezone.utc)
            cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
                        (thread_id, thread_title, thread_time)
                       )

            posts = soup.find_all(class_='postContainer')
            for post_index, post in enumerate(posts):
                # information gathering
                # Post ids look like "pc12345"; strip the two-letter prefix.
                post_id = int(post.get('id')[2:])
                name = post.find(class_='name').text
                trip_code = post.find(class_='postertrip')
                if trip_code:
                    trip_code = trip_code.text
                subject = post.find(class_='subject')
                if subject:
                    subject = subject.text
                post_time = datetime.datetime.fromtimestamp(
                    int(post.find(class_='dateTime').get('data-utc')),
                    tz=datetime.timezone.utc)
                file_url = None
                file_name = None
                file_md5 = None
                file_text = post.find(class_='fileText')
                if file_text:
                    file_url = file_text.a.get('href')
                    file_name = file_text.a.text
                    file_md5 = post.find(class_='fileThumb').img.get('data-md5')
                post_body = post.find(class_='postMessage').get_text('\n')
                # Keep only same-thread quotelinks (href "#p12345") and
                # strip the ">>" prefix from the visible text.
                links = post.find_all(class_='quotelink')
                links = [l for l in links if l.get('href').startswith('#')]
                links = [int(link.text[2:]) for link in links]

                # heuristics
                tags = []
                if name in QM_NAMES:
                    tags.append('qm_post')
                    # also counts shitposters and broken tripcodes
                    tags.append('story_post')
                    # assume every QM post is also a story post, until
                    # proven otherwise
                if "dropped my trip" in post_body.lower():
                    # The QM confirming a dropped trip retroactively tags
                    # the posts he quotes as his own.
                    for link in links:
                        cur.execute("INSERT INTO tag "
                                    "VALUES (%s,%s), (%s,%s), (%s,%s)",
                                    (link, 'qm_post',
                                     link, 'story_post',
                                     link, 'dropped_trip'
                                    )
                                   )
                        # dropped trip doesn't necessarily mean story_post
                    tags.append('dropped_trip')
                if len(links) > 1:
                    tags.append('vote_tally_post')
                    # also counts Q&A posts
                    try:
                        tags.remove('story_post')
                    except ValueError:
                        pass
                # BUGFIX (perf): enumerate() replaces posts.index(post),
                # which rescanned the list for every post.
                if post_index == 0:
                    tags.append('op_post')
                if "Welcome to Banished Quest!" in post_body:
                    try:
                        tags.remove('story_post')
                    except ValueError:
                        pass

                # database insert
                cur.execute(
                    "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (thread_id, post_id, name, trip_code, subject, post_time,
                     file_url, file_name, file_md5, post_body)
                )
                for link in links:
                    cur.execute("INSERT INTO link VALUES (%s,%s)",
                                (post_id, link)
                               )
                for tag in tags:
                    cur.execute("INSERT INTO tag VALUES (%s,%s)",
                                (post_id, tag)
                               )


if __name__ == '__main__':
    init_db()
    scrape_posts(os.path.join(BQ_DIR, 'archive'))
    scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
    con.commit()
    con.close()