Voyage/fix_br.py
2022-09-01 12:41:22 -04:00

72 lines
2.2 KiB
Python

#!/usr/bin/env python3
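"""
Fixes a mistake in the newline structure of post bodies produced by scrape_quest.py.

Re-reads the archived index.html pages under BQ_DIR, rebuilds each post's body
with <br> tags turned into newlines, and updates the `post` table to match.
"""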
import os
import re
import datetime
import bs4
import psycopg2
import config
# Root directory holding the `archive` and `qstarchive` trees scanned by this script.
BQ_DIR = '/var/www/html/banished/'
# Database connection, opened as soon as the module runs, using the settings in `config.db`.
con = psycopg2.connect(**config.db)
# Cursor used by scrape_posts() for every UPDATE it issues.
cur = con.cursor()
# Maps the markup class of a source <span> to the CSS class written into the stored body.
_SPAN_CLASSES = {
    'mu-s': 'bold',
    'mu-i': 'italic',
    'mu-r': 'red',
    'mu-g': 'green',
    'mu-b': 'blue',
}


def _render_child(child):
    """Return the stored-body text for one direct child of a post message.

    `<b>` elements and `<span>` elements whose first class appears in
    `_SPAN_CLASSES` keep a wrapping tag; everything else (text nodes, other
    tags, spans without a recognised class) contributes only its text.
    The text itself is emitted as returned by `get_text()`, without escaping.
    """
    text = child.get_text()
    if child.name == 'b':
        return '<b>' + text + '</b>'
    if child.name == 'span':
        # A span may carry no class attribute at all; treat that as "unstyled"
        # instead of failing on the missing key.
        classes = child.get('class') or []
        css_class = _SPAN_CLASSES.get(classes[0]) if classes else None
        if css_class is not None:
            return '<span class="' + css_class + '">' + text + '</span>'
    return text


def _render_post_body(post_body):
    """Convert a post message element into the body string stored in the database."""
    # Replace every <br> below the message with a literal newline before the
    # text is collected, so line breaks survive the conversion.
    for br in post_body.find_all('br'):
        br.replace_with('\n')
    return ''.join(_render_child(child) for child in post_body.children)


def scrape_posts(root_dir):
    """Walks the `root_dir` and rewrites post bodies from the index.html files found.

    Every `index.html` below `root_dir` is parsed; for each element with class
    `postContainer`, the body is rebuilt from its `postMessage` element and
    written to the `post` table through the module-level cursor `cur`.
    Nothing is committed here; the caller owns the transaction.

    Args:
        root_dir: Directory to walk recursively. A directory that does not
            exist or contains no `index.html` results in no work.

    Returns:
        None.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # os.walk already yields directory paths rooted at root_dir, so
        # prepending root_dir again would duplicate it for relative roots.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        # NOTE(review): the file is read with the platform default encoding,
        # as before - confirm whether the pages should be read as UTF-8.
        with open(filepath, 'r') as page:
            data = page.read()
        # <wbr> tags would otherwise split words into separate text nodes.
        data = re.sub(r'<wbr ?\/?>', '', data)
        soup = bs4.BeautifulSoup(data, 'html.parser')
        for post in soup.find_all(class_='postContainer'):
            # The numeric post id follows a two-character prefix in the element id.
            post_id = int(post.get('id')[2:])
            post_body = post.find(class_='postMessage')
            if post_body is None:
                print("Skipping post without a message body:", post_id)
                continue
            cur.execute(
                "UPDATE post SET body = (%s) WHERE id = (%s)",
                (_render_post_body(post_body), post_id)
            )
if __name__ == '__main__':
    # Release the connection even when a page fails to process. The commit is
    # only reached if both trees were handled without an exception, so a
    # failed run leaves the table unchanged.
    try:
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        con.commit()
    finally:
        con.close()