#!/usr/bin/env python3
"""
Fixes a mistake made to the newline structure in scrape_quest.py
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'
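# Module-level connection shared by scrape_posts(); config.db is assumed to be
# a dict of psycopg2 connection keyword arguments (dbname, user, password, ...)
# defined in the local config module.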
con = psycopg2.connect(**config.db)
cur = con.cursor()


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found."""
    for dir_name, sub_dirs, files in os.walk(root_dir):
        for filename in files:
            if filename != 'index.html':
                continue
            # dir_name from os.walk() already includes root_dir, so only the
            # filename needs to be joined onto it.
            filepath = os.path.join(dir_name, filename)
            print("Processing:", filepath)
            with open(filepath, 'r') as fh:
                data = fh.read()
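            # Strip <wbr> word-break tags up front so they cannot split words
            # in the extracted text.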
            data = re.sub(r'<wbr ?/?>', '', data)
            soup = bs4.BeautifulSoup(data, 'html.parser')
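            # The thread element's id carries a one-character prefix before the
            # numeric id, hence the [1:] slice (thread_id is parsed but not
            # used by this fix-up).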
            thread_id = int(soup.find(class_='thread').get('id')[1:])

            posts = soup.find_all(class_='postContainer')
            for post in posts:
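                # Post container ids carry a two-character prefix before the
                # numeric post id, hence the [2:] slice.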
                post_id = int(post.get('id')[2:])

                post_body = post.find(class_='postMessage')
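                # Replace <br> tags with real newlines before extracting the
                # text, restoring the post's original line structure.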
                for br in post_body.find_all('br'):
                    br.replace_with('\n')
                post_body_txt = post_body.get_text()
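                # Overwrite the stored body with the corrected text.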
                cur.execute(
                    "UPDATE post SET body = (%s) WHERE id = (%s)",
                    (post_body_txt, post_id)
                )


if __name__ == '__main__':
    scrape_posts(os.path.join(BQ_DIR, 'archive'))
    scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
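    # Commit all of the UPDATEs in one transaction, then close the connection.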
    con.commit()
    con.close()