151 lines
4.5 KiB
Python
151 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extracts post data from a quest archive.
|
|
"""
|
|
import os
|
|
import re
|
|
import datetime
|
|
|
|
import bs4
|
|
import psycopg2
|
|
|
|
import config
|
|
|
|
# Root directory of the archive; the script entry point scrapes its
# 'archive' and 'qstarchive' sub-directories.
BQ_DIR = '/var/www/html/banished/'
# Poster names that earn a post the 'qm_post' tag in scrape_posts()
# (QM presumably stands for "quest master" -- confirm).
QM_NAMES = ['Soma', 'Somas']

# Module-level connection and cursor shared by every function in this file.
# NOTE: the connection is opened at import time from the settings in config.db.
con = psycopg2.connect(**config.db)
cur = con.cursor()
|
|
|
|
def init_db():
    """Run the contents of ``voyage.sql`` through the module-level cursor.

    The file is looked up relative to the current working directory.
    Whether calling this on an already-initialised database is harmless
    depends on the statements inside ``voyage.sql`` (presumably they guard
    against re-creation -- confirm against that file).
    """
    with open('voyage.sql', 'r') as schema_file:
        schema_sql = schema_file.read()
    cur.execute(schema_sql)
|
|
|
|
|
|
def _utc_datetime(raw_timestamp):
    """Turn a Unix timestamp (int or numeric string) into an aware UTC datetime."""
    return datetime.datetime.fromtimestamp(
        int(raw_timestamp), tz=datetime.timezone.utc)


def scrape_posts(root_dir):
    """Walk `root_dir` and load every ``index.html`` found into the database.

    Each ``index.html`` is parsed as one thread. For every thread a row is
    inserted into ``thread``; for every element with class ``postContainer``
    a row is inserted into ``post``, plus rows in ``link`` (same-page quote
    links), ``tag`` (heuristic labels) and ``file`` (attachment names, when
    the post has one). All statements go through the module-level cursor
    ``cur``; nothing is committed here.

    Args:
        root_dir: Directory to walk recursively. May be absolute or relative.

    Raises:
        AttributeError / TypeError / ValueError: if a page lacks one of the
            elements or attributes the scraper expects (e.g. no ``thread``
            element, or a non-numeric id).
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # os.walk() already yields dir_name prefixed with root_dir, so joining
        # root_dir again would duplicate it for relative roots.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        with open(filepath, 'r') as handle:
            data = handle.read()
        # Strip <wbr> tags so they do not split text when the body is
        # extracted with a newline separator below.
        data = re.sub(r'<wbr ?\/?>', '', data)
        soup = bs4.BeautifulSoup(data, 'html.parser')

        # Remove every element with class 'mobile' before querying
        # (presumably duplicated mobile-layout markup -- confirm).
        for mobile_tag in soup.find_all(class_='mobile'):
            mobile_tag.decompose()

        # The thread id attribute carries a one-character prefix.
        thread_id = int(soup.find(class_='thread').get('id')[1:])
        thread_title = soup.find(class_='subject').text
        thread_time = _utc_datetime(
            soup.find(class_='dateTime').get('data-utc'))
        cur.execute(
            "INSERT INTO thread VALUES (%s,%s,%s)",
            (thread_id, thread_title, thread_time)
        )

        posts = soup.find_all(class_='postContainer')
        for position, post in enumerate(posts):
            # information gathering
            # The post id attribute carries a two-character prefix.
            post_id = int(post.get('id')[2:])
            name = post.find(class_='name').text
            tripcode = post.find(class_='postertrip')
            if tripcode:
                tripcode = tripcode.text
            subject = post.find(class_='subject')
            if subject:
                subject = subject.text
            post_time = _utc_datetime(
                post.find(class_='dateTime').get('data-utc'))

            chan_file_name = None
            original_file_name = None
            file_text = post.find(class_='fileText')
            if file_text:
                # Last path component of the attachment URL.
                chan_file_name = file_text.a.get('href').rpartition('/')[2]
                original_file_name = file_text.a.text
            post_body = post.find(class_='postMessage').get_text('\n')
            body_lower = post_body.lower()

            # Keep only quote links that point inside the same page
            # (href starting with '#'); the link text carries a
            # two-character prefix before the post number.
            quote_links = post.find_all(class_='quotelink')
            quote_links = [
                anchor for anchor in quote_links
                if anchor.get('href').startswith('#')
            ]
            links = [int(anchor.text[2:]) for anchor in quote_links]
            links = list(set(links))

            # heuristics
            tags = set()
            if name in QM_NAMES:
                tags.add('qm_post')
                # also counts shitposters and broken tripcodes
                tags.add('story_post')
                # assume every QM post is also a story post, until
                # proven otherwise
            if "dropped my trip" in body_lower:
                # The posts this one quotes are tagged as QM story posts.
                for link in links:
                    cur.execute(
                        "INSERT INTO tag "
                        "VALUES (%s,%s), (%s,%s), (%s,%s)",
                        (link, 'qm_post',
                         link, 'story_post',
                         link, 'dropped_trip')
                    )
                # dropped trip doesn't necessarily mean story_post
                tags.add('dropped_trip')
            if len(links) > 1:
                tags.add('tally_post')
                # also counts Q&A posts
                tags.discard('story_post')
            if 'writin' in body_lower:
                tags.add('tally_post')
                tags.discard('story_post')
            if position == 0:
                tags.add('op_post')
            if "Welcome to Banished Quest!" in post_body:
                tags.discard('story_post')
            if re.search(r'ro+l+ me', body_lower):
                tags.add('dice_call')
                tags.discard('story_post')
            if re.search(r'roll .*3d10', body_lower):
                tags.add('dice_call')
            if 'final destination' in body_lower:
                tags.add('final_destination')
                tags.discard('story_post')
            if 'story_post' in tags:
                # More than one line starting with '>' marks vote choices.
                if len(re.findall(r'\n>', post_body)) > 1:
                    tags.add('vote_choices')

            # database insert
            cur.execute(
                "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)",
                (thread_id, post_id, name, tripcode, subject,
                 post_time, post_body)
            )
            for link in links:
                cur.execute(
                    "INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                )
            for tag in tags:
                cur.execute(
                    "INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                )
            if file_text:
                cur.execute(
                    "INSERT INTO file VALUES (%s,%s,%s)",
                    (post_id, chan_file_name, original_file_name)
                )
|
|
|
|
|
|
if __name__ == '__main__':
    # Always release the connection, even when initialisation or scraping
    # fails part-way. The commit is only reached on success, so (with
    # psycopg2's default non-autocommit mode) a failed run commits nothing.
    try:
        init_db()
        for archive_name in ('archive', 'qstarchive'):
            scrape_posts(os.path.join(BQ_DIR, archive_name))
        con.commit()
    finally:
        con.close()
|