#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
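
# BQ_DIR is the root of the local archive mirror; index.html snapshots are read
# from its archive/ and qstarchive/ subdirectories (see the __main__ block).
# QM_NAMES lists the names the quest master posted under; they drive the
# tagging heuristics in scrape_posts().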
BQ_DIR = '/var/www/html/banished/'
QM_NAMES = ['Soma', 'Somas']
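
# config.db is assumed to be a dict of psycopg2 connection parameters
# (dbname, user, password, ...), unpacked straight into connect()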
con = psycopg2.connect(**config.db)
cur = con.cursor()


def init_db():
    """Initializes the database if it hasn't been already."""
    with open('voyage.sql', 'r') as file:
        cur.execute(file.read())


def scrape_posts(root_dir):
    """Walks `root_dir` and extracts data from every index.html file found."""
    for dir_name, sub_dirs, files in os.walk(root_dir):
        for file in files:
            if file != 'index.html':
                continue
            # dir_name from os.walk already includes root_dir
            filepath = os.path.join(dir_name, file)
            print("Processing:", filepath)
            with open(filepath, 'r') as html_file:
                data = html_file.read()
            # drop soft line-break hints so post bodies come out as continuous text
            data = re.sub(r'<wbr ?/?>', '', data)
            soup = bs4.BeautifulSoup(data, 'html.parser')
            # remove the mobile-layout duplicates of the post information
            # before extracting anything
            mobiles = soup.find_all(class_='mobile')
            for tag in mobiles:
                tag.decompose()
            # thread metadata comes from the OP; the thread element's id looks
            # like "t12345678", so strip the leading "t"
            thread_id = int(soup.find(class_='thread').get('id')[1:])
            thread_title = soup.find(class_='subject').text
            thread_time = int(soup.find(class_='dateTime').get('data-utc'))
            thread_time = datetime.datetime.utcfromtimestamp(thread_time)
            thread_time = thread_time.replace(tzinfo=datetime.timezone.utc)
            cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
                        (thread_id, thread_title, thread_time))
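            # each reply, the OP included, sits in its own postContainer element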
            posts = soup.find_all(class_='postContainer')
            for post in posts:
                # information gathering
                post_id = int(post.get('id')[2:])  # container ids look like "pc12345678"
                subject = post.find(class_='subject')
                if subject:
                    subject = subject.text
                name = post.find(class_='name').text
                user_email = post.find(class_='useremail')
                if user_email:
                    user_email = user_email.get('href')
                tripcode = post.find(class_='postertrip')
                if tripcode:
                    tripcode = tripcode.text
                user_id = post.find(class_='hand')
                if user_id:
                    user_id = user_id.text
                post_time = int(post.find(class_='dateTime').get('data-utc'))
                post_time = datetime.datetime.utcfromtimestamp(post_time)
                post_time = post_time.replace(tzinfo=datetime.timezone.utc)
                chan_file_name = None
                original_file_name = None
                file_text = post.find(class_='fileText')
                if file_text:
                    chan_file_name = file_text.a.get('href').rpartition('/')[2]
                    original_file_name = file_text.a.text
                post_body = post.find(class_='postMessage').get_text('\n')
                # keep only in-thread quote links (href starts with "#"), strip
                # the ">>" prefix from their text, and deduplicate
                links = post.find_all(class_='quotelink')
                links = [l for l in links if l.get('href').startswith('#')]
                links = [int(link.text[2:]) for link in links]
                links = list(set(links))
                # heuristics
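                # a QM-named post is assumed to be story content by default; the
                # checks below strip 'story_post' again when the post turns out to
                # be a tally, dice call, or OP boilerplate instead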
                tags = set()
                if name in QM_NAMES:
                    tags.add('qm_post')
                    # also counts shitposters and broken tripcodes
                    tags.add('story_post')
                    # assume every QM post is also a story post, until
                    # proven otherwise
                if "dropped my trip" in post_body.lower():
                    for link in links:
                        cur.execute("INSERT INTO tag "
                                    "VALUES (%s,%s), (%s,%s), (%s,%s)",
                                    (link, 'qm_post',
                                     link, 'story_post',
                                     link, 'dropped_trip'
                                     )
                                    )
                    # dropped trip doesn't necessarily mean story_post
                    tags.add('dropped_trip')
                if len(links) > 1:
                    tags.add('tally_post')
                    # also counts Q&A posts
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if 'writin' in post_body.lower():
                    tags.add('tally_post')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if posts.index(post) == 0:
                    tags.add('op_post')
                if "Welcome to Banished Quest!" in post_body:
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if re.search(r'ro+l+ me', post_body.lower()):
                    tags.add('dice_call')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if re.search(r'roll .*3d10', post_body.lower()):
                    tags.add('dice_call')
                if 'final destination' in post_body.lower():
                    tags.add('final_destination')
                    if 'story_post' in tags:
                        tags.remove('story_post')
                if 'story_post' in tags:
                    if len(re.findall(r'\n>', post_body)) > 1:
                        tags.add('vote_choices')
# database insert
cur.execute(
"INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(thread_id, post_id, subject, name, user_email, tripcode,
user_id, post_time, post_body)
)
for link in links:
cur.execute("INSERT INTO link VALUES (%s,%s)",
(post_id, link)
)
for tag in tags:
cur.execute("INSERT INTO tag VALUES (%s,%s)",
(post_id, tag)
)
if file_text:
cur.execute("INSERT INTO file VALUES (%s,%s,%s)",
(post_id, chan_file_name, original_file_name)
)
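

# everything runs on the single module-level connection; nothing is committed
# until both archive directories have been scraped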
if __name__ == '__main__':
    init_db()
    scrape_posts(os.path.join(BQ_DIR, 'archive'))
    scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
    con.commit()
    con.close()