#!/usr/bin/env python3
"""
Fixes a mistake made to the newline structure in scrape_quest.py

Re-reads the archived ``index.html`` pages below ``BQ_DIR`` and rewrites the
``body`` column of the matching ``post`` rows, using the text of each post
message with every ``<br>`` element replaced by a newline character.
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'

# The connection and cursor are created at import time and shared by
# scrape_posts(); nothing is committed until the script entry point does so.
con = psycopg2.connect(**config.db)
cur = con.cursor()


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    Every element with class ``postContainer`` is looked up in each
    ``index.html``; its ``postMessage`` child has all ``<br>`` elements
    replaced by ``'\\n'`` and the resulting text is written to the ``body``
    column of the ``post`` row whose ``id`` matches the container's id.

    The updates are executed on the module-level cursor but NOT committed
    here; the caller is responsible for ``con.commit()``.

    Args:
        root_dir: Directory to walk recursively.

    Raises:
        TypeError, ValueError: If a post container has no ``id`` attribute
            or the part after its two-character prefix is not an integer.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        # File names are unique within a directory, so a membership test
        # is equivalent to scanning the list for 'index.html'.
        if 'index.html' not in files:
            continue

        # os.walk() already yields dir_name prefixed with root_dir, so
        # root_dir must not be joined in a second time (that would
        # duplicate the prefix for relative roots).
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        with open(filepath, 'r') as page:
            data = page.read()

        # NOTE(review): the pattern is empty, so this substitution leaves
        # `data` unchanged. Presumably a tag was meant to be stripped here
        # and the pattern was lost -- confirm against scrape_quest.py.
        data = re.sub(r'', '', data)
        soup = bs4.BeautifulSoup(data, 'html.parser')

        for post in soup.find_all(class_='postContainer'):
            # The numeric post id follows a two-character prefix in the
            # container's id attribute.
            post_id = int(post.get('id')[2:])

            post_body = post.find(class_='postMessage')
            if post_body is None:
                # Nothing to rewrite for a container without a message.
                print("Skipping post without postMessage:", post_id)
                continue

            for br in post_body.find_all('br'):
                br.replace_with('\n')
            post_body_txt = post_body.get_text()

            cur.execute(
                "UPDATE post SET body = (%s) WHERE id = (%s)",
                (post_body_txt, post_id)
            )


if __name__ == '__main__':
    try:
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        # Commit only after both archives were processed successfully.
        con.commit()
    finally:
        # Always release the connection; closing without a commit
        # discards the pending updates.
        con.close()