#!/usr/bin/env python3
"""
Extracts post data from a quest archive.

Walks archive directories for ``index.html`` thread pages, parses each one
with BeautifulSoup, tags every post using text heuristics and inserts
thread, post, link, tag and file rows through the module-level cursor.
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'
QM_NAMES = ['Soma', 'Somas']

# Patterns used by the tagging heuristics; compiled once instead of per post.
_ROLL_ME_RE = re.compile(r'ro+l+ me')
_ROLL_3D10_RE = re.compile(r'roll .*3d10')
_VOTE_LINE_RE = re.compile(r'\n>')

con = psycopg2.connect(**config.db)
cur = con.cursor()


def init_db():
    """Execute the SQL script in `voyage.sql` on the module-level cursor.

    The path is relative, so the script is looked up in the current working
    directory. Presumably the script is safe to run against an already
    initialized database -- verify against voyage.sql.
    """
    with open('voyage.sql', 'r') as sql_file:
        cur.execute(sql_file.read())


def _utc_datetime(tag):
    """Return the `data-utc` epoch attribute of `tag` as an aware UTC datetime."""
    return datetime.datetime.fromtimestamp(
        int(tag.get('data-utc')), tz=datetime.timezone.utc
    )


def _classify_post(name, post_body, links, is_op):
    """Return the set of heuristic tags for a single post.

    name      -- poster name as shown in the post header
    post_body -- post text with line breaks rendered as newlines
    links     -- ids of the posts in this thread that the post quotes
    is_op     -- True when the post is the first one in its thread

    The checks run in a fixed order because later ones may withdraw the
    'story_post' tag that an earlier one granted.

    NOTE(review): this file's indentation was lost before review; the checks
    below were reconstructed as independent, un-nested conditions. Confirm
    against the original layout.
    """
    body_lower = post_body.lower()
    tags = set()

    if name in QM_NAMES:
        # also counts shitposters and broken tripcodes
        tags.add('qm_post')
        # assume every QM post is also a story post, until proven otherwise
        tags.add('story_post')
    if "dropped my trip" in body_lower:
        tags.add('dropped_trip')
    if len(links) > 1:
        # also counts Q&A posts
        tags.add('tally_post')
        tags.discard('story_post')
    if 'writin' in body_lower:
        tags.add('tally_post')
        tags.discard('story_post')
    if is_op:
        tags.add('op_post')
    if "Welcome to Banished Quest!" in post_body:
        tags.discard('story_post')
    if _ROLL_ME_RE.search(body_lower):
        tags.add('dice_call')
        tags.discard('story_post')
    if _ROLL_3D10_RE.search(body_lower):
        tags.add('dice_call')
    if 'final destination' in body_lower:
        tags.add('final_destination')
        tags.discard('story_post')
    # More than one line starting with '>' in a story post is taken to be a
    # list of vote options.
    if 'story_post' in tags and len(_VOTE_LINE_RE.findall(post_body)) > 1:
        tags.add('vote_choices')
    return tags


def _scrape_thread(filepath):
    """Parse one thread page and insert its thread, posts, links, tags and files."""
    with open(filepath, 'r') as page:
        data = page.read()
    # NOTE(review): the pattern is empty, so this substitution changes
    # nothing. It looks like the original pattern (presumably some markup to
    # strip) was lost -- restore it or drop the call.
    data = re.sub(r'', '', data)
    soup = bs4.BeautifulSoup(data, 'html.parser')

    # Remove every element with the 'mobile' class before any lookup below.
    for mobile in soup.find_all(class_='mobile'):
        mobile.decompose()

    # The thread id attribute carries a one-character prefix before the number.
    thread_id = int(soup.find(class_='thread').get('id')[1:])
    thread_title = soup.find(class_='subject').text
    thread_time = _utc_datetime(soup.find(class_='dateTime'))
    cur.execute(
        "INSERT INTO thread VALUES (%s,%s,%s)",
        (thread_id, thread_title, thread_time)
    )

    for index, post in enumerate(soup.find_all(class_='postContainer')):
        # information gathering
        # The post id attribute carries a two-character prefix.
        post_id = int(post.get('id')[2:])
        name = post.find(class_='name').text
        tripcode = post.find(class_='postertrip')
        if tripcode is not None:
            tripcode = tripcode.text
        subject = post.find(class_='subject')
        if subject is not None:
            subject = subject.text
        post_time = _utc_datetime(post.find(class_='dateTime'))

        chan_file_name = None
        original_file_name = None
        file_text = post.find(class_='fileText')
        if file_text is not None:
            # Stored name is the last path segment of the file link; the
            # link text holds the name the file was uploaded with.
            chan_file_name = file_text.a.get('href').rpartition('/')[2]
            original_file_name = file_text.a.text

        post_body = post.find(class_='postMessage').get_text('\n')

        # Keep only same-page quotes (href starts with '#'), de-duplicated.
        # The link text carries a two-character prefix before the post id.
        quotes = post.find_all(class_='quotelink')
        links = list({
            int(quote.text[2:])
            for quote in quotes
            if quote.get('href', '').startswith('#')
        })

        # heuristics
        tags = _classify_post(name, post_body, links, index == 0)
        if 'dropped_trip' in tags:
            # Every quoted post is tagged as a QM story post posted without
            # the tripcode.
            # dropped trip doesn't necessarily mean story_post
            for link in links:
                cur.execute(
                    "INSERT INTO tag "
                    "VALUES (%s,%s), (%s,%s), (%s,%s)",
                    (link, 'qm_post',
                     link, 'story_post',
                     link, 'dropped_trip')
                )

        # database insert
        cur.execute(
            "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)",
            (thread_id, post_id, name, tripcode, subject,
             post_time, post_body)
        )
        for link in links:
            cur.execute(
                "INSERT INTO link VALUES (%s,%s)",
                (post_id, link)
            )
        for tag in tags:
            cur.execute(
                "INSERT INTO tag VALUES (%s,%s)",
                (post_id, tag)
            )
        if file_text is not None:
            cur.execute(
                "INSERT INTO file VALUES (%s,%s,%s)",
                (post_id, chan_file_name, original_file_name)
            )


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    Every file named exactly `index.html` below `root_dir` is parsed and
    its rows are inserted through the module-level cursor. Nothing is
    committed here; the caller owns the transaction.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for file_name in files:
            if file_name != 'index.html':
                continue
            # os.walk already yields dir_name prefixed with root_dir, so
            # joining root_dir again would break relative roots.
            filepath = os.path.join(dir_name, file_name)
            print("Processing:", filepath)
            _scrape_thread(filepath)


if __name__ == '__main__':
    try:
        init_db()
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        # Commit only after both archives were processed without error.
        con.commit()
    finally:
        con.close()