fifth commit

This commit is contained in:
iou1name 2019-12-20 15:11:21 -05:00
parent 715fc2af8e
commit b15de6f803
2 changed files with 19 additions and 11 deletions

View File

@@ -32,7 +32,9 @@ def scrape_posts(root_dir):
filepath = os.path.join(root_dir, dir_name, file) filepath = os.path.join(root_dir, dir_name, file)
print("Processing:", filepath) print("Processing:", filepath)
with open(filepath, 'r') as file: with open(filepath, 'r') as file:
soup = bs4.BeautifulSoup(file.read(), 'html.parser') data = file.read()
data = re.sub(r'<wbr ?\/?>', '', data)
soup = bs4.BeautifulSoup(data, 'html.parser')
mobiles = soup.find_all(class_='mobile') mobiles = soup.find_all(class_='mobile')
for tag in mobiles: for tag in mobiles:
@@ -74,11 +76,11 @@ def scrape_posts(root_dir):
links = list(set(links)) links = list(set(links))
# heuristics # heuristics
tags = [] tags = set()
if name in QM_NAMES: if name in QM_NAMES:
tags.append('qm_post') tags.add('qm_post')
# also counts shitposters and broken tripcodes # also counts shitposters and broken tripcodes
tags.append('story_post') tags.add('story_post')
# assume every QM post is also a story post, until # assume every QM post is also a story post, until
# proven otherwise # proven otherwise
if "dropped my trip" in post_body.lower(): if "dropped my trip" in post_body.lower():
@@ -91,28 +93,34 @@ def scrape_posts(root_dir):
) )
) )
# dropped trip doesn't necessarily mean story_post # dropped trip doesn't necessarily mean story_post
tags.append('dropped_trip') tags.add('dropped_trip')
if len(links) > 1: if len(links) > 1:
tags.append('tally_post') tags.add('tally_post')
# also counts Q&A posts # also counts Q&A posts
if 'story_post' in tags: if 'story_post' in tags:
tags.remove('story_post') tags.remove('story_post')
if 'writin' in post_body.lower():
tags.add('tally_post')
if 'story_post' in tags:
tags.remove('story_post')
if posts.index(post) == 0: if posts.index(post) == 0:
tags.append('op_post') tags.add('op_post')
if "Welcome to Banished Quest!" in post_body: if "Welcome to Banished Quest!" in post_body:
if 'story_post' in tags: if 'story_post' in tags:
tags.remove('story_post') tags.remove('story_post')
if re.search(r'ro+l+ me', post_body.lower()): if re.search(r'ro+l+ me', post_body.lower()):
tags.append('dice_call') tags.add('dice_call')
if 'story_post' in tags: if 'story_post' in tags:
tags.remove('story_post') tags.remove('story_post')
if re.search(r'roll .*3d10', post_body.lower()):
tags.add('dice_call')
if 'final destination' in post_body.lower(): if 'final destination' in post_body.lower():
tags.append('final_destination') tags.add('final_destination')
if 'story_post' in tags: if 'story_post' in tags:
tags.remove('story_post') tags.remove('story_post')
if 'story_post' in tags: if 'story_post' in tags:
if len(re.findall(r'\n>', post_body)) > 1: if len(re.findall(r'\n>', post_body)) > 1:
tags.append('vote_choices') tags.add('vote_choices')
# database insert # database insert
cur.execute( cur.execute(

View File

@@ -20,7 +20,7 @@ body {
.tag_button { .tag_button {
color: blue; color: blue;
font-size: 1em; font-size: 1.5em;
} }
.tag_button:hover { .tag_button:hover {