#!/usr/bin/env python3
"""
Fixes a mistake made to the newline structure in scrape_quest.py.

Re-reads every archived ``index.html`` page, rebuilds each post body with
``<br>`` tags turned into newline characters, and writes the result back to
the ``body`` column of the ``post`` table.
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'

# Connection and cursor are shared by scrape_posts() and the script entry
# point below; the entry point commits and closes the connection.
con = psycopg2.connect(**config.db)
cur = con.cursor()


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    For every element with class ``postContainer`` in each ``index.html``,
    the text of its ``postMessage`` child is recomputed (with ``<br>``
    replaced by newlines) and stored via an ``UPDATE`` on the ``post`` table.
    Nothing is committed here; the caller commits on the module-level
    connection.

    Args:
        root_dir: Directory to search recursively. May be absolute or
            relative.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        for file_name in files:
            if file_name != 'index.html':
                continue

            # os.walk already yields dir_name prefixed with root_dir, so
            # joining root_dir again would duplicate it for relative roots.
            filepath = os.path.join(dir_name, file_name)
            print("Processing:", filepath)

            with open(filepath, 'r') as page:
                data = page.read()

            # NOTE(review): an empty pattern makes this substitution a no-op;
            # the original pattern may have been lost -- confirm what was
            # meant to be stripped before parsing.
            data = re.sub(r'', '', data)
            soup = bs4.BeautifulSoup(data, 'html.parser')

            for post in soup.find_all(class_='postContainer'):
                # The numeric post id follows a two-character prefix in the
                # container's id attribute.
                post_id = int(post.get('id')[2:])
                post_body = post.find(class_='postMessage')

                # get_text() drops tags, so swap each <br> for an explicit
                # newline first to keep the line structure.
                for br in post_body.find_all('br'):
                    br.replace_with('\n')
                post_body_txt = post_body.get_text()

                cur.execute(
                    "UPDATE post SET body = (%s) WHERE id = (%s)",
                    (post_body_txt, post_id)
                )


if __name__ == '__main__':
    try:
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        # Commit only when both archives were processed without error.
        con.commit()
    finally:
        con.close()