Voyage/scrape_quest.py

151 lines
4.5 KiB
Python
Raw Normal View History

2019-12-06 18:13:43 -05:00
#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
# Root directory of the archived quest pages; the script entry point scrapes
# the `archive` and `qstarchive` subdirectories beneath it.
BQ_DIR = '/var/www/html/banished/'
# Poster names that the tagging heuristics in `scrape_posts` treat as the
# quest master (QM).
QM_NAMES = ['Soma', 'Somas']
# Database connection and cursor, opened as a side effect of importing this
# module using the keyword arguments in `config.db`. The functions below issue
# all of their SQL through `cur`; nothing is committed until the caller does so.
con = psycopg2.connect(**config.db)
cur = con.cursor()
def init_db():
    """Execute the SQL script `voyage.sql` on the module-level cursor.

    The script is read from the current working directory. Whether running it
    twice is harmless depends on the statements inside `voyage.sql`, which are
    not visible here -- presumably it only creates missing tables; verify
    before relying on that.
    """
    with open('voyage.sql', 'r') as schema_file:
        schema_sql = schema_file.read()
    cur.execute(schema_sql)
def _utc_datetime(node):
    """Return the aware UTC datetime of the first `dateTime` element in `node`."""
    timestamp = int(node.find(class_='dateTime').get('data-utc'))
    # Same value as utcfromtimestamp(...).replace(tzinfo=utc), without relying
    # on the deprecated naive-UTC constructor.
    return datetime.datetime.fromtimestamp(
        timestamp, tz=datetime.timezone.utc)


def _classify_post(name, post_body, links, is_op):
    """Return the set of heuristic tags for a single post.

    NOTE(review): the nesting of these checks was reconstructed from a listing
    that had lost its indentation; every heuristic is treated as an
    independent check -- confirm against the repository copy.
    """
    body_lower = post_body.lower()
    tags = set()

    if name in QM_NAMES:
        tags.add('qm_post')
        # also counts shitposters and broken tripcodes
        tags.add('story_post')
        # assume every QM post is also a story post, until
        # proven otherwise
    if "dropped my trip" in body_lower:
        # dropped trip doesn't necessarily mean story_post
        tags.add('dropped_trip')
    if len(links) > 1:
        tags.add('tally_post')
        # also counts Q&A posts
        tags.discard('story_post')
    if 'writin' in body_lower:
        tags.add('tally_post')
        tags.discard('story_post')
    if is_op:
        tags.add('op_post')
    if "Welcome to Banished Quest!" in post_body:
        tags.discard('story_post')
    if re.search(r'ro+l+ me', body_lower):
        tags.add('dice_call')
        tags.discard('story_post')
    if re.search(r'roll .*3d10', body_lower):
        tags.add('dice_call')
    if 'final destination' in body_lower:
        tags.add('final_destination')
        tags.discard('story_post')
    # Only posts still considered story posts can offer vote choices, which
    # show up as more than one line starting with '>'.
    if 'story_post' in tags and len(re.findall(r'\n>', post_body)) > 1:
        tags.add('vote_choices')
    return tags


def scrape_posts(root_dir):
    """Walk `root_dir` and load every `index.html` thread page into the database.

    For each page one row is inserted into `thread`, then for each post a row
    into `post`, one `link` row per distinct same-page quote link, one `tag`
    row per heuristic tag and, when the post has an attachment, one `file`
    row. All statements go through the module-level cursor `cur`; nothing is
    committed here.

    Args:
        root_dir: Directory to search recursively. Relative paths work too,
            because the file path is built from the directory that `os.walk`
            reports rather than by prefixing `root_dir` a second time.

    Raises:
        AttributeError: If a page lacks one of the expected elements (thread
            container, subject, timestamp, poster name, post message), since
            the lookup then returns None.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # `dir_name` already starts with `root_dir`.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        # Explicit encoding so the result does not depend on the locale.
        # NOTE(review): assumes the archived pages are UTF-8 -- confirm.
        with open(filepath, 'r', encoding='utf-8') as page:
            data = page.read()
        # Drop <wbr> line-break hints so they do not split words in the text.
        data = re.sub(r'<wbr ?\/?>', '', data)
        soup = bs4.BeautifulSoup(data, 'html.parser')

        # Remove the elements marked `mobile` before extracting anything, so
        # they cannot be picked up by the lookups below.
        for mobile_tag in soup.find_all(class_='mobile'):
            mobile_tag.decompose()

        # The thread element's id is one prefix character followed by digits.
        thread_id = int(soup.find(class_='thread').get('id')[1:])
        thread_title = soup.find(class_='subject').text
        thread_time = _utc_datetime(soup)
        cur.execute(
            "INSERT INTO thread VALUES (%s,%s,%s)",
            (thread_id, thread_title, thread_time)
        )

        posts = soup.find_all(class_='postContainer')
        for position, post in enumerate(posts):
            # information gathering
            # Post container ids carry a two-character prefix.
            post_id = int(post.get('id')[2:])
            name = post.find(class_='name').text
            trip_tag = post.find(class_='postertrip')
            tripcode = trip_tag.text if trip_tag else None
            subject_tag = post.find(class_='subject')
            subject = subject_tag.text if subject_tag else None
            post_time = _utc_datetime(post)

            chan_file_name = None
            original_file_name = None
            file_text = post.find(class_='fileText')
            if file_text:
                # Last path component of the attachment URL.
                chan_file_name = file_text.a.get('href').rpartition('/')[2]
                original_file_name = file_text.a.text

            post_body = post.find(class_='postMessage').get_text('\n')

            # Keep only quote links that point inside this page ('#...') and
            # strip the leading '>>' from their text to get the post number.
            # Sorted so duplicates collapse and the insert order is stable.
            links = sorted({
                int(quote.text[2:])
                for quote in post.find_all(class_='quotelink')
                if quote.get('href').startswith('#')
            })

            # heuristics
            tags = _classify_post(name, post_body, links, position == 0)
            if 'dropped_trip' in tags:
                # The quoted posts are taken to be QM posts made without the
                # tripcode, so they are tagged retroactively. This runs before
                # the insert of the current post, as it always has.
                for link in links:
                    cur.execute(
                        "INSERT INTO tag "
                        "VALUES (%s,%s), (%s,%s), (%s,%s)",
                        (link, 'qm_post',
                         link, 'story_post',
                         link, 'dropped_trip')
                    )

            # database insert
            cur.execute(
                "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)",
                (thread_id, post_id, name, tripcode, subject,
                 post_time, post_body)
            )
            for link in links:
                cur.execute(
                    "INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                )
            for tag in tags:
                cur.execute(
                    "INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                )
            if file_text:
                cur.execute(
                    "INSERT INTO file VALUES (%s,%s,%s)",
                    (post_id, chan_file_name, original_file_name)
                )
2019-12-06 18:13:43 -05:00
if __name__ == '__main__':
    # Close the connection even when initialisation or scraping fails
    # part-way. The commit is only reached when both archives were processed
    # without error, so a failed run leaves nothing committed.
    try:
        init_db()
        for archive_name in ('archive', 'qstarchive'):
            scrape_posts(os.path.join(BQ_DIR, archive_name))
        con.commit()
    finally:
        con.close()