#!/usr/bin/env python3
"""
Fixes a mistake made to the newline structure in scrape_quest.py
"""
import os
import re
import datetime

import bs4
import psycopg2

import config

BQ_DIR = '/var/www/html/banished/'
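# Module-level connection shared by scrape_posts(); config.db is assumed to be
# a dict of psycopg2 connection keyword arguments (dbname, user, password, ...)
# defined in the local config module.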
con = psycopg2.connect(**config.db)
cur = con.cursor()


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found."""
    for dir_name, sub_dirs, files in os.walk(root_dir):
        for filename in files:
            if filename != 'index.html':
                continue
            # dir_name from os.walk() already includes root_dir, so only the
            # filename needs to be joined onto it.
            filepath = os.path.join(dir_name, filename)
            print("Processing:", filepath)
            with open(filepath, 'r') as fh:
                data = fh.read()
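            # Strip <wbr> word-break tags up front so they cannot split words
            # in the extracted text.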
            data = re.sub(r'<wbr ?/?>', '', data)
            soup = bs4.BeautifulSoup(data, 'html.parser')
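            # The thread element's id carries a one-character prefix before the
            # numeric id, hence the [1:] slice (thread_id is parsed but not
            # used by this fix-up).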
            thread_id = int(soup.find(class_='thread').get('id')[1:])

            posts = soup.find_all(class_='postContainer')
            for post in posts:
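                # Post container ids carry a two-character prefix before the
                # numeric post id, hence the [2:] slice.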
                post_id = int(post.get('id')[2:])

                post_body = post.find(class_='postMessage')
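                # Replace <br> tags with real newlines before extracting the
                # text, restoring the post's original line structure.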
                for br in post_body.find_all('br'):
                    br.replace_with('\n')
                post_body_txt = post_body.get_text()
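                # Overwrite the stored body with the corrected text.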
                cur.execute(
                    "UPDATE post SET body = (%s) WHERE id = (%s)",
                    (post_body_txt, post_id)
                )


if __name__ == '__main__':
    scrape_posts(os.path.join(BQ_DIR, 'archive'))
    scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
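    # Commit all of the UPDATEs in one transaction, then close the connection.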
    con.commit()
    con.close()