From 022e35e13ca24bac7da94527c0b4354d9df23451 Mon Sep 17 00:00:00 2001
From: iou1name <iou1name@national.shitposting.agency>
Date: Sat, 27 Aug 2022 00:10:37 -0400
Subject: [PATCH] fix br

---
 fix_br.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 fix_br.py
diff --git a/fix_br.py b/fix_br.py
new file mode 100644
index 0000000..5e8ca78
--- /dev/null
+++ b/fix_br.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Fixes a mistake made to the newline structure in scrape_quest.py
+"""
+import os
+import re
+import datetime
+
+import bs4
+import psycopg2
+
+import config
+
+BQ_DIR = '/var/www/html/banished/'  # base directory; 'archive' and 'qstarchive' are joined onto it
+# Connection is opened at import time from config.db; `cur` is used by scrape_posts
+con = psycopg2.connect(**config.db)
+cur = con.cursor()
+
+def scrape_posts(root_dir):
+	"""
+	Walks `root_dir` and, for each index.html found, rewrites `post.body` in
+	the database from the page, with `<br>` tags in every `postMessage` turned
+	into newlines. Issues UPDATEs on the module-level `cur`; does not commit.
+	"""
+	for dir_name, _sub_dirs, files in os.walk(root_dir):
+		if 'index.html' not in files:
+			continue
+		# dir_name from os.walk already starts with root_dir; don't join it twice
+		filepath = os.path.join(dir_name, 'index.html')
+		print("Processing:", filepath)
+		# explicit encoding so the read doesn't depend on the locale default
+		with open(filepath, 'r', encoding='utf-8') as html_file:
+			data = html_file.read()
+		data = re.sub(r'<wbr ?\/?>', '', data)
+		soup = bs4.BeautifulSoup(data, 'html.parser')
+		for post in soup.find_all(class_='postContainer'):
+			post_body = post.find(class_='postMessage')
+			if post_body is None:  # no message element, nothing to rewrite
+				continue
+			for br in post_body.find_all('br'):
+				br.replace_with('\n')
+			cur.execute(
+				"UPDATE post SET body = (%s) WHERE id = (%s)",
+				# element id is a two-character prefix followed by the post id
+				(post_body.get_text(), int(post.get('id')[2:]))
+			)
+
+if __name__ == '__main__':
+	for archive_name in ('archive', 'qstarchive'):
+		scrape_posts(os.path.join(BQ_DIR, archive_name))
+	con.commit()  # persist the UPDATEs issued by both runs above
+	con.close()