first commit

This commit is contained in:
iou1name 2019-12-06 18:13:43 -05:00
commit e64bdab04b
10 changed files with 285 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
__pycache__/
*.swp
*.swo
config.py

15
LICENSE Normal file
View File

@ -0,0 +1,15 @@
ISC License
Copyright (c) 2019, iou1name <iou1name@steelbea.me>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

21
README.md Normal file
View File

@ -0,0 +1,21 @@
# Voyage
Life is a journey.
## Requirements
Python 3.7+
PostgreSQL 11.5+
Python packages: `gunicorn aiohttp aiohttp_jinja2 asyncpg uvloop` (plus `beautifulsoup4 psycopg2` to run `scrape_quest.py`)
## Install
```
$ psql
postgres=# CREATE DATABASE "voyage";
postgres=# CREATE USER "voyage" WITH PASSWORD 'password';
postgres=# GRANT ALL PRIVILEGES ON DATABASE "voyage" TO "voyage";
postgres=# \q
```
1. Get on the floor
2. Walk the dinosaur
## Usage
`gunicorn voyage:init_app --bind localhost:5450 --worker-class aiohttp.GunicornWebWorker`

17
config.py.template Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""
Configuration settings for Voyage.

`url_prefix` is the root path you wish the app to reside at,
eg. https://example.com/voyage
`db` specifies parameters for connecting to the PostgreSQL database.

NOTE(review): this docstring used to describe a `server_domain` setting
as well, but the template defines no such name -- confirm whether it is
still needed.
"""
# Path prefix the web application is mounted under.
url_prefix = '/voyage'
# Keyword arguments unpacked into the database connection calls; the
# values mirror the database and user created in the README install steps.
db = {
    'database': 'voyage',
    'user': 'voyage',
    'password': """password""",
    'host': 'localhost',
    'port': 5432,
}

132
scrape_quest.py Normal file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Extracts post data from a quest archive.
"""
import os
import re
import datetime
import bs4
import psycopg2
import config
# Root directory of the quest archive; its 'archive' and 'qstarchive'
# subdirectories are the ones scraped when this file runs as a script.
BQ_DIR = '/var/www/html/banished/'
# Poster names whose posts get tagged as 'qm_post'.
QM_NAMES = ['Soma', 'Somas']
# Module-level connection and cursor used by the functions in this file.
# NOTE: because this runs at import time, importing the module opens a
# database connection.
con = psycopg2.connect(**config.db)
cur = con.cursor()
def init_db():
    """Runs the schema script `voyage.sql` through the shared cursor.

    The script is read from the current working directory.
    """
    with open('voyage.sql', 'r') as schema_file:
        schema_sql = schema_file.read()
    cur.execute(schema_sql)
def _utc_time(element):
    """Returns the first 'dateTime' timestamp inside `element` as an aware
    UTC datetime."""
    timestamp = int(element.find(class_='dateTime').get('data-utc'))
    # Yields the same aware value as the deprecated
    # utcfromtimestamp(...).replace(tzinfo=utc) pairing.
    return datetime.datetime.fromtimestamp(
        timestamp, tz=datetime.timezone.utc)


def _drop_tag(tags, tag):
    """Removes `tag` from the list `tags` if it is present."""
    if tag in tags:
        tags.remove(tag)


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    Each index.html is treated as one archived thread: one row is inserted
    into `thread`, one row per post into `post`, one `link` row per
    same-page quote link, and one `tag` row per heuristic tag. All inserts
    go through the module-level cursor `cur`; nothing is committed here.

    `root_dir` may be an absolute or a relative path.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # os.walk() already yields `dir_name` prefixed with `root_dir`;
        # joining `root_dir` on again doubles the prefix for relative roots.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        with open(filepath, 'r') as html_file:
            soup = bs4.BeautifulSoup(html_file.read(), 'html.parser')

        # Remove every element with class 'mobile' before searching the
        # page (presumably duplicates of the desktop markup -- confirm).
        for mobile_tag in soup.find_all(class_='mobile'):
            mobile_tag.decompose()

        # The thread element's id carries a one-character prefix before
        # the number.
        thread_id = int(soup.find(class_='thread').get('id')[1:])
        thread_title = soup.find(class_='subject').text
        thread_time = _utc_time(soup)
        cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
            (thread_id, thread_title, thread_time)
        )

        posts = soup.find_all(class_='postContainer')
        for position, post in enumerate(posts):
            # information gathering
            # Post container ids carry a two-character prefix.
            post_id = int(post.get('id')[2:])
            name = post.find(class_='name').text
            trip_tag = post.find(class_='postertrip')
            trip_code = trip_tag.text if trip_tag else None
            subject_tag = post.find(class_='subject')
            subject = subject_tag.text if subject_tag else None
            post_time = _utc_time(post)

            file_url = None
            file_name = None
            file_md5 = None
            file_text = post.find(class_='fileText')
            if file_text:
                file_url = file_text.a.get('href')
                file_name = file_text.a.text
                file_md5 = post.find(class_='fileThumb').img.get('data-md5')

            post_body = post.find(class_='postMessage').get_text('\n')
            # Only quote links whose href starts with '#' (targets on the
            # same page) are kept; the first two characters of the link
            # text are stripped to get the post number.
            links = [
                int(anchor.text[2:])
                for anchor in post.find_all(class_='quotelink')
                if anchor.get('href').startswith('#')
            ]

            # heuristics
            tags = []
            if name in QM_NAMES:
                # also counts shitposters and broken tripcodes
                tags.append('qm_post')
                # assume every QM post is also a story post, until
                # proven otherwise
                tags.append('story_post')
            if "dropped my trip" in post_body.lower():
                # The posts quoted by this one are tagged directly.
                for link in links:
                    cur.execute("INSERT INTO tag "
                        "VALUES (%s,%s), (%s,%s), (%s,%s)",
                        (link, 'qm_post',
                        link, 'story_post',
                        link, 'dropped_trip'
                        )
                    )
                # dropped trip doesn't necessarily mean story_post
                tags.append('dropped_trip')
            if len(links) > 1:
                # also counts Q&A posts
                tags.append('vote_tally_post')
                _drop_tag(tags, 'story_post')
            if position == 0:
                tags.append('op_post')
            if "Welcome to Banished Quest!" in post_body:
                _drop_tag(tags, 'story_post')

            # database insert
            cur.execute(
                "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (thread_id, post_id, name, trip_code, subject,
                post_time, file_url, file_name, file_md5, post_body)
            )
            for link in links:
                cur.execute("INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                )
            for tag in tags:
                cur.execute("INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                )
if __name__ == '__main__':
    # Create the schema, scrape both archive trees, then persist
    # everything with a single commit before closing the connection.
    init_db()
    for archive_name in ('archive', 'qstarchive'):
        scrape_posts(os.path.join(BQ_DIR, archive_name))
    con.commit()
    con.close()

0
static/voyage.css Normal file
View File

0
static/voyage.js Normal file
View File

22
templates/index.html Normal file
View File

@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Voyage</title>
    <link rel="stylesheet" type="text/css" href="/static/voyage.css">
    <script type="text/javascript" src="/static/voyage.js"></script>
    <meta name="viewport" content="width=device-width, initial-scale=0.8">
    <meta name="description" content="A quest archive viewer.">
</head>
<body>
    <header>
        <h1>Voyage</h1>
    </header>
    <main>
        {# `threads` is the sequence of thread rows supplied by the view; only each row's title is shown. #}
        <ul>
            {% for thread in threads %}
            <li>{{ thread.title }}</li>
            {% endfor %}
        </ul>
    </main>
</body>
</html>

46
voyage.py Normal file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
A display interface for archived 4chan quest threads.
"""
from aiohttp import web
import jinja2
import aiohttp_jinja2
from aiohttp_jinja2 import render_template
import asyncpg
import uvloop
import config
# Switch asyncio over to uvloop's event loop implementation.
uvloop.install()
# Route table filled in by the @routes decorators in this module and
# registered on the application in init_app().
routes = web.RouteTableDef()
@routes.get('/', name='index')
async def index(request):
    """The index page.

    Fetches every row of the `thread` table, ordered by time ascending,
    and renders them with the `index.html` template.
    """
    async with request.app['pool'].acquire() as conn:
        threads = await conn.fetch("SELECT * FROM thread ORDER BY time ASC")
    # Hand the template only what it renders; locals() would also expose
    # `request` and the already-released `conn` to it.
    return render_template("index.html", request, {'threads': threads})
async def _close_pool(app):
    """Closes the database connection pool stored on `app`."""
    await app['pool'].close()


async def init_app():
    """Initializes the application.

    Sets up Jinja2 templating, opens the asyncpg connection pool from
    `config.db`, registers the routes, and mounts the result under
    `config.url_prefix` inside a wrapper application, which is returned.
    """
    app = web.Application()
    aiohttp_jinja2.setup(
        app,
        trim_blocks=True,
        lstrip_blocks=True,
        undefined=jinja2.StrictUndefined,
        loader=jinja2.FileSystemLoader('templates'),
    )
    app['pool'] = await asyncpg.create_pool(**config.db)
    # Without a cleanup hook the pool is never closed on shutdown. It is
    # registered before add_subapp() so the hook is in place when the
    # sub-application is attached.
    app.on_cleanup.append(_close_pool)
    app.router.add_routes(routes)

    # Mounting as a sub-application places every route under the prefix.
    app_wrap = web.Application()
    app_wrap.add_subapp(config.url_prefix, app)
    return app_wrap
if __name__ == "__main__":
    # Direct-run entry point: run_app() is handed the init_app() coroutine
    # and listens on all interfaces, port 5450.
    web.run_app(init_app(), host='0.0.0.0', port=5450)

28
voyage.sql Normal file
View File

@ -0,0 +1,28 @@
-- Schema for the quest archive.
-- NOTE: scrape_quest.py inserts into every table positionally
-- (INSERT ... VALUES without a column list), so the column order below
-- must stay in step with those statements.

-- One row per archived thread.
CREATE TABLE IF NOT EXISTS thread(
    id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    time TIMESTAMP WITH TIME ZONE NOT NULL
);
-- One row per post; removed together with its thread.
-- trip_code, subject and the file_* columns are NULL when the post has none.
CREATE TABLE IF NOT EXISTS post(
    thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    trip_code TEXT,
    subject TEXT,
    time TIMESTAMP WITH TIME ZONE NOT NULL,
    file_url TEXT,
    file_name TEXT,
    file_md5 TEXT,
    body TEXT NOT NULL
);
-- Quote links between posts: link_from is the quoting post, link_to the
-- quoted one. No uniqueness constraint, so duplicate rows are allowed.
CREATE TABLE IF NOT EXISTS link(
    link_from INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
    link_to INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL
);
-- Heuristic labels attached to posts by the scraper (e.g. 'qm_post',
-- 'story_post', 'op_post'). No uniqueness constraint, so a post may carry
-- the same tag more than once.
CREATE TABLE IF NOT EXISTS tag(
    post_id INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
    name TEXT NOT NULL
);