fifth commit
This commit is contained in:
parent
715fc2af8e
commit
b15de6f803
|
@ -32,7 +32,9 @@ def scrape_posts(root_dir):
|
||||||
filepath = os.path.join(root_dir, dir_name, file)
|
filepath = os.path.join(root_dir, dir_name, file)
|
||||||
print("Processing:", filepath)
|
print("Processing:", filepath)
|
||||||
with open(filepath, 'r') as file:
|
with open(filepath, 'r') as file:
|
||||||
soup = bs4.BeautifulSoup(file.read(), 'html.parser')
|
data = file.read()
|
||||||
|
data = re.sub(r'<wbr ?\/?>', '', data)
|
||||||
|
soup = bs4.BeautifulSoup(data, 'html.parser')
|
||||||
|
|
||||||
mobiles = soup.find_all(class_='mobile')
|
mobiles = soup.find_all(class_='mobile')
|
||||||
for tag in mobiles:
|
for tag in mobiles:
|
||||||
|
@ -74,11 +76,11 @@ def scrape_posts(root_dir):
|
||||||
links = list(set(links))
|
links = list(set(links))
|
||||||
|
|
||||||
# heuristics
|
# heuristics
|
||||||
tags = []
|
tags = set()
|
||||||
if name in QM_NAMES:
|
if name in QM_NAMES:
|
||||||
tags.append('qm_post')
|
tags.add('qm_post')
|
||||||
# also counts shitposters and broken tripcodes
|
# also counts shitposters and broken tripcodes
|
||||||
tags.append('story_post')
|
tags.add('story_post')
|
||||||
# assume every QM post is also a story post, until
|
# assume every QM post is also a story post, until
|
||||||
# proven otherwise
|
# proven otherwise
|
||||||
if "dropped my trip" in post_body.lower():
|
if "dropped my trip" in post_body.lower():
|
||||||
|
@ -91,28 +93,34 @@ def scrape_posts(root_dir):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
# dropped trip doesn't necessarily mean story_post
|
# dropped trip doesn't necessarily mean story_post
|
||||||
tags.append('dropped_trip')
|
tags.add('dropped_trip')
|
||||||
if len(links) > 1:
|
if len(links) > 1:
|
||||||
tags.append('tally_post')
|
tags.add('tally_post')
|
||||||
# also counts Q&A posts
|
# also counts Q&A posts
|
||||||
if 'story_post' in tags:
|
if 'story_post' in tags:
|
||||||
tags.remove('story_post')
|
tags.remove('story_post')
|
||||||
|
if 'writin' in post_body.lower():
|
||||||
|
tags.add('tally_post')
|
||||||
|
if 'story_post' in tags:
|
||||||
|
tags.remove('story_post')
|
||||||
if posts.index(post) == 0:
|
if posts.index(post) == 0:
|
||||||
tags.append('op_post')
|
tags.add('op_post')
|
||||||
if "Welcome to Banished Quest!" in post_body:
|
if "Welcome to Banished Quest!" in post_body:
|
||||||
if 'story_post' in tags:
|
if 'story_post' in tags:
|
||||||
tags.remove('story_post')
|
tags.remove('story_post')
|
||||||
if re.search(r'ro+l+ me', post_body.lower()):
|
if re.search(r'ro+l+ me', post_body.lower()):
|
||||||
tags.append('dice_call')
|
tags.add('dice_call')
|
||||||
if 'story_post' in tags:
|
if 'story_post' in tags:
|
||||||
tags.remove('story_post')
|
tags.remove('story_post')
|
||||||
|
if re.search(r'roll .*3d10', post_body.lower()):
|
||||||
|
tags.add('dice_call')
|
||||||
if 'final destination' in post_body.lower():
|
if 'final destination' in post_body.lower():
|
||||||
tags.append('final_destination')
|
tags.add('final_destination')
|
||||||
if 'story_post' in tags:
|
if 'story_post' in tags:
|
||||||
tags.remove('story_post')
|
tags.remove('story_post')
|
||||||
if 'story_post' in tags:
|
if 'story_post' in tags:
|
||||||
if len(re.findall(r'\n>', post_body)) > 1:
|
if len(re.findall(r'\n>', post_body)) > 1:
|
||||||
tags.append('vote_choices')
|
tags.add('vote_choices')
|
||||||
|
|
||||||
# database insert
|
# database insert
|
||||||
cur.execute(
|
cur.execute(
|
||||||
|
|
|
@ -20,7 +20,7 @@ body {
|
||||||
|
|
||||||
.tag_button {
|
.tag_button {
|
||||||
color: blue;
|
color: blue;
|
||||||
font-size: 1em;
|
font-size: 1.5em;
|
||||||
}
|
}
|
||||||
|
|
||||||
.tag_button:hover {
|
.tag_button:hover {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user