Voyage/scrape_quest.py

138 lines
3.9 KiB
Python
Raw Normal View History

2019-12-06 18:13:43 -05:00
#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
# Root of the locally-mirrored quest archive; scraped thread pages live in
# subdirectories under here (see the __main__ block).
BQ_DIR = '/var/www/html/banished/'
# Poster names treated as the questmaster by the tagging heuristics.
# NOTE(review): matched by name only, so impersonators also match — the
# heuristics below acknowledge this.
QM_NAMES = ['Soma', 'Somas']
# Module-level connection/cursor shared by every function in this script;
# committed and closed once at the end of __main__.
con = psycopg2.connect(**config.db)
cur = con.cursor()
def init_db():
    """Initializes the database if it hasn't been already.

    The schema lives in voyage.sql; it is executed verbatim through the
    module-level cursor.
    """
    with open('voyage.sql', 'r') as schema:
        ddl = schema.read()
    cur.execute(ddl)
def scrape_posts(root_dir):
    """Walks `root_dir` and extracts data from index.html files found.

    Every thread page is parsed with BeautifulSoup and inserted through the
    module-level cursor: one `thread` row, one `post` row per post, plus
    `link`, `tag` and `file` rows.  Nothing is committed here; the caller
    owns the transaction.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for filename in files:
            if filename != 'index.html':
                continue
            # os.walk already yields the full directory path in dir_name, so
            # joining root_dir in again was redundant (and wrong whenever
            # root_dir is a relative path).
            filepath = os.path.join(dir_name, filename)
            print("Processing:", filepath)
            with open(filepath, 'r') as page:
                soup = bs4.BeautifulSoup(page.read(), 'html.parser')
            # The markup duplicates post data in .mobile nodes; drop them so
            # the find() calls below only ever hit the desktop copy.
            for node in soup.find_all(class_='mobile'):
                node.decompose()
            _scrape_thread(soup)


def _utc_time(timestamp):
    """Converts a unix timestamp (int) into an aware UTC datetime."""
    return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)


def _scrape_thread(soup):
    """Inserts the thread row for one parsed page, then each of its posts."""
    thread_id = int(soup.find(class_='thread').get('id')[1:])  # id="t12345"
    thread_title = soup.find(class_='subject').text
    thread_time = _utc_time(int(soup.find(class_='dateTime').get('data-utc')))
    cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
                (thread_id, thread_title, thread_time)
                )
    posts = soup.find_all(class_='postContainer')
    # enumerate instead of posts.index(post): same result, O(1) per post.
    for index, post in enumerate(posts):
        _scrape_post(thread_id, post, is_op=(index == 0))


def _scrape_post(thread_id, post, is_op):
    """Extracts one post's fields, applies tag heuristics, inserts its rows."""
    # information gathering
    post_id = int(post.get('id')[2:])  # container id="pc12345"
    name = post.find(class_='name').text
    tripcode = post.find(class_='postertrip')
    if tripcode:
        tripcode = tripcode.text
    subject = post.find(class_='subject')
    if subject:
        subject = subject.text
    post_time = _utc_time(int(post.find(class_='dateTime').get('data-utc')))
    file_url = None
    file_name = None
    file_md5 = None
    file_text = post.find(class_='fileText')
    if file_text:
        file_url = file_text.a.get('href')
        file_name = file_text.a.text
        file_md5 = post.find(class_='fileThumb').img.get('data-md5')
    post_body = post.find(class_='postMessage').get_text('\n')
    links = post.find_all(class_='quotelink')
    # Keep only same-thread quotes (href="#p12345"); link text is ">>12345",
    # so [2:] strips the ">>" prefix.  De-duplicate repeated quotes.
    links = [l for l in links if l.get('href').startswith('#')]
    links = list({int(link.text[2:]) for link in links})

    # heuristics
    tags = []
    if name in QM_NAMES:
        # also counts shitposters and broken tripcodes
        tags.append('qm_post')
        # assume every QM post is also a story post, until proven otherwise
        tags.append('story_post')
    if "dropped my trip" in post_body.lower():
        # Retroactively tag the posts this one quotes as QM posts.
        for link in links:
            cur.execute("INSERT INTO tag "
                        "VALUES (%s,%s), (%s,%s), (%s,%s)",
                        (link, 'qm_post',
                         link, 'story_post',
                         link, 'dropped_trip'
                         )
                        )
        # dropped trip doesn't necessarily mean story_post
        tags.append('dropped_trip')
    if len(links) > 1:
        # also counts Q&A posts
        tags.append('vote_tally_post')
        try:
            tags.remove('story_post')
        except ValueError:
            pass
    if is_op:
        tags.append('op_post')
    if "Welcome to Banished Quest!" in post_body:
        try:
            tags.remove('story_post')
        except ValueError:
            pass

    # database inserts
    cur.execute(
        "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)",
        (thread_id, post_id, name, tripcode, subject,
         post_time, post_body)
    )
    for link in links:
        cur.execute("INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                    )
    for tag in tags:
        cur.execute("INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                    )
    if file_text:
        cur.execute("INSERT INTO file VALUES (%s,%s,%s)",
                    (file_url, file_name, file_md5)
                    )
if __name__ == '__main__':
    # Create the schema, scrape both archive trees, then commit everything
    # in a single transaction.
    init_db()
    for subdir in ('archive', 'qstarchive'):
        scrape_posts(os.path.join(BQ_DIR, subdir))
    con.commit()
    con.close()