fix br
This commit is contained in:
parent
bcfe40292d
commit
022e35e13c
52
fix_br.py
Normal file
52
fix_br.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fixes a mistake made to the newline structure in scrape_quest.py
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import bs4
|
||||
import psycopg2
|
||||
|
||||
import config
|
||||
|
||||
# Base directory holding the 'archive' and 'qstarchive' trees that the script
# entry point passes to scrape_posts().
BQ_DIR = '/var/www/html/banished/'
|
||||
|
||||
# Module-level database handles: scrape_posts() issues its UPDATEs through
# `cur`, and the script entry point commits and closes `con`.
# NOTE: the connection is opened as soon as this module is imported, using the
# keyword arguments stored in config.db.
con = psycopg2.connect(**config.db)
cur = con.cursor()
|
||||
|
||||
def scrape_posts(root_dir):
    """Rewrite stored post bodies from the archived ``index.html`` pages.

    Walks `root_dir` recursively. In every ``index.html`` found, each
    ``postContainer`` element has its ``postMessage`` content flattened to
    plain text -- ``<wbr>`` tags removed, ``<br>`` tags turned into newlines --
    and the result is written to ``post.body`` for the row whose ``id``
    matches the number in the container's ``id`` attribute.

    The updates go through the module-level cursor ``cur`` and are not
    committed here.

    Args:
        root_dir: Directory tree to search; may be absolute or relative.
    """
    # Compiled once so the pattern is not looked up again for every page.
    wbr_tag = re.compile(r'<wbr ?\/?>')

    for dir_name, _sub_dirs, files in os.walk(root_dir):
        # File names within one directory are unique, so a membership test
        # is enough to decide whether this directory has a page to process.
        if 'index.html' not in files:
            continue

        # os.walk() yields dir_name already prefixed with root_dir; joining
        # root_dir in a second time would double a relative prefix and point
        # at a path that does not exist.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)

        with open(filepath, 'r') as page:
            data = page.read()

        # Strip <wbr> tags from the raw markup before it is parsed.
        data = wbr_tag.sub('', data)
        soup = bs4.BeautifulSoup(data, 'html.parser')

        for post in soup.find_all(class_='postContainer'):
            # The first two characters of the id attribute are skipped; the
            # remainder is parsed as the numeric post id.
            post_id = int(post.get('id')[2:])

            post_body = post.find(class_='postMessage')
            # Swap each <br> for a real newline so get_text() keeps the
            # line structure of the post.
            for br in post_body.find_all('br'):
                br.replace_with('\n')
            post_body_txt = post_body.get_text()

            cur.execute(
                "UPDATE post SET body = (%s) WHERE id = (%s)",
                (post_body_txt, post_id)
            )
|
||||
|
||||
if __name__ == '__main__':
    # Both archive trees are processed before anything is committed, so the
    # commit is only reached when every page was handled without an error.
    try:
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        con.commit()
    finally:
        # Always release the connection, including when a scrape raised
        # before the commit was reached.
        con.close()
|
Loading…
Reference in New Issue
Block a user