first commit
This commit is contained in:
commit
e64bdab04b
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
__pycache__/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
config.py
|
15
LICENSE
Normal file
15
LICENSE
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
ISC License
|
||||||
|
|
||||||
|
Copyright (c) 2019, iou1name <iou1name@steelbea.me>
|
||||||
|
|
||||||
|
Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
purpose with or without fee is hereby granted, provided that the above
|
||||||
|
copyright notice and this permission notice appear in all copies.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
21
README.md
Normal file
21
README.md
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
# Voyage
|
||||||
|
Life is a journey.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
Python 3.7+
|
||||||
|
PostgreSQL 11.5+
|
||||||
|
Python packages: `gunicorn aiohttp aiohttp_jinja2 asyncpg uvloop` (plus `beautifulsoup4 psycopg2` to run `scrape_quest.py`)
|
||||||
|
|
||||||
|
## Install
|
||||||
|
```
|
||||||
|
$ psql
|
||||||
|
postgres=# CREATE DATABASE "voyage";
|
||||||
|
postgres=# CREATE USER "voyage" WITH PASSWORD 'password';
|
||||||
|
postgres=# GRANT ALL PRIVILEGES ON DATABASE "voyage" TO "voyage";
|
||||||
|
postgres=# \q
|
||||||
|
```
|
||||||
|
1. Get on the floor
|
||||||
|
2. Walk the dinosaur
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
`gunicorn voyage:init_app --bind localhost:5450 --worker-class aiohttp.GunicornWebWorker`
|
17
config.py.template
Normal file
17
config.py.template
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Configuration settings for Voyage.
|
||||||
|
`server_domain` is the server's domain.
|
||||||
|
`url_prefix` is the root path you wish app to reside at
|
||||||
|
eg. https://example.com/voyage
|
||||||
|
`db` specifies parameters for connecting to the PostgreSQL database.
|
||||||
|
"""
|
||||||
|
# Path the application is mounted under; voyage.py passes it to add_subapp().
url_prefix = '/voyage'
|
||||||
|
|
||||||
|
# PostgreSQL connection parameters. The dict is unpacked as keyword arguments
# into psycopg2.connect() (scrape_quest.py) and asyncpg.create_pool()
# (voyage.py), so the keys must be accepted by both.
db = {
|
||||||
|
'database': 'voyage',
|
||||||
|
'user': 'voyage',
|
||||||
|
# placeholder value -- replace it in your own config.py
'password': """password""",
|
||||||
|
'host': 'localhost',
|
||||||
|
'port': 5432,
|
||||||
|
}
|
132
scrape_quest.py
Normal file
132
scrape_quest.py
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extracts post data from a quest archive.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
import bs4
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
# Base directory of the archive on disk; its 'archive' and 'qstarchive'
# subdirectories are the ones scraped when this file is run as a script.
BQ_DIR = '/var/www/html/banished/'
|
||||||
|
# Poster names whose posts are tagged 'qm_post' (and initially 'story_post').
QM_NAMES = ['Soma', 'Somas']
|
||||||
|
|
||||||
|
# Module-level connection and cursor shared by init_db() and scrape_posts().
# NOTE: the connection is opened as a side effect of importing this module.
con = psycopg2.connect(**config.db)
|
||||||
|
cur = con.cursor()
|
||||||
|
|
||||||
|
def init_db():
    """
    Initializes the database if it hasn't been already.

    Executes the whole of `voyage.sql` through the module-level cursor. The
    schema file is looked up next to this script rather than in the current
    working directory, so the scraper can be started from anywhere. Nothing
    is committed here; the caller commits once scraping has finished.
    """
    schema_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'voyage.sql'
    )
    # Explicit encoding so the result does not depend on the locale.
    with open(schema_path, 'r', encoding='utf-8') as schema:
        cur.execute(schema.read())
|
||||||
|
|
||||||
|
|
||||||
|
def _utc_datetime(tag):
    """Converts the epoch seconds in `tag`'s `data-utc` attribute to an
    aware UTC datetime."""
    return datetime.datetime.fromtimestamp(
        int(tag.get('data-utc')), tz=datetime.timezone.utc
    )


def _classify_post(name, post_body, links, is_op):
    """
    Applies the tagging heuristics to one post and returns its tag list.

    `name` is the poster name, `post_body` the post text, `links` the ids of
    the same-thread posts it quotes and `is_op` whether it is the first post
    of its thread.

    NOTE(review): the listing this was restored from had lost its
    indentation; the checks below are treated as independent siblings.
    Confirm against the original file.
    """
    tags = []
    if name in QM_NAMES:
        # also counts shitposters and broken tripcodes
        tags.append('qm_post')
        # assume every QM post is also a story post, until proven otherwise
        tags.append('story_post')
    if "dropped my trip" in post_body.lower():
        # dropped trip doesn't necessarily mean story_post
        tags.append('dropped_trip')
    if len(links) > 1:
        # also counts Q&A posts
        tags.append('vote_tally_post')
        if 'story_post' in tags:
            tags.remove('story_post')
    if is_op:
        tags.append('op_post')
    if "Welcome to Banished Quest!" in post_body and 'story_post' in tags:
        tags.remove('story_post')
    return tags


def _store_post(thread_id, post, is_op):
    """Extracts one `postContainer` element and inserts the post row, its
    link rows and its tag rows through the module-level cursor."""
    # information gathering
    post_id = int(post.get('id')[2:])  # drop the two-character id prefix
    name = post.find(class_='name').text

    trip_tag = post.find(class_='postertrip')
    trip_code = trip_tag.text if trip_tag is not None else None
    subject_tag = post.find(class_='subject')
    subject = subject_tag.text if subject_tag is not None else None
    post_time = _utc_datetime(post.find(class_='dateTime'))

    file_url = file_name = file_md5 = None
    file_text = post.find(class_='fileText')
    if file_text is not None:
        file_url = file_text.a.get('href')
        file_name = file_text.a.text
        file_md5 = post.find(class_='fileThumb').img.get('data-md5')
    post_body = post.find(class_='postMessage').get_text('\n')

    # Only quote links whose href starts with '#' are kept; the link text
    # carries a two-character prefix before the post number.
    links = [
        int(anchor.text[2:])
        for anchor in post.find_all(class_='quotelink')
        if anchor.get('href').startswith('#')
    ]

    # heuristics
    tags = _classify_post(name, post_body, links, is_op)
    if 'dropped_trip' in tags:
        # The quoted posts are retroactively tagged as QM story posts.
        for link in links:
            cur.execute(
                "INSERT INTO tag "
                "VALUES (%s,%s), (%s,%s), (%s,%s)",
                (link, 'qm_post',
                 link, 'story_post',
                 link, 'dropped_trip')
            )

    # database insert
    cur.execute(
        "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        (thread_id, post_id, name, trip_code, subject,
         post_time, file_url, file_name, file_md5, post_body)
    )
    for link in links:
        cur.execute("INSERT INTO link VALUES (%s,%s)", (post_id, link))
    for tag in tags:
        cur.execute("INSERT INTO tag VALUES (%s,%s)", (post_id, tag))


def _store_thread(soup):
    """Inserts the thread row for a parsed index.html, then every post."""
    # Elements with class 'mobile' are removed before anything is looked up
    # (presumably duplicated mobile-layout markup -- TODO confirm).
    for mobile in soup.find_all(class_='mobile'):
        mobile.decompose()

    thread_id = int(soup.find(class_='thread').get('id')[1:])
    thread_title = soup.find(class_='subject').text
    thread_time = _utc_datetime(soup.find(class_='dateTime'))
    cur.execute(
        "INSERT INTO thread VALUES (%s,%s,%s)",
        (thread_id, thread_title, thread_time)
    )
    # enumerate() marks the opening post without searching the list again
    # for every post.
    for position, post in enumerate(soup.find_all(class_='postContainer')):
        _store_post(thread_id, post, is_op=(position == 0))


def scrape_posts(root_dir):
    """
    Walks the `root_dir` and extracts data from index.html files found.

    Every index.html is parsed with BeautifulSoup and stored as one thread
    plus its posts, links and tags through the module-level cursor. Nothing
    is committed here.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # os.walk already yields `dir_name` prefixed with `root_dir`;
        # joining `root_dir` again would break relative roots.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        with open(filepath, 'r', encoding='utf-8') as page:
            soup = bs4.BeautifulSoup(page.read(), 'html.parser')
        _store_thread(soup)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Create the schema, scrape both archive trees and commit everything in
    # one transaction. The connection is closed even if scraping fails, in
    # which case nothing has been committed.
    try:
        init_db()
        scrape_posts(os.path.join(BQ_DIR, 'archive'))
        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
        con.commit()
    finally:
        con.close()
|
0
static/voyage.css
Normal file
0
static/voyage.css
Normal file
0
static/voyage.js
Normal file
0
static/voyage.js
Normal file
22
templates/index.html
Normal file
22
templates/index.html
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<title>Voyage</title>
|
||||||
|
<link rel="stylesheet" type="text/css" href="/static/voyage.css">
|
||||||
|
<script type="text/javascript" src="/static/voyage.js"></script>
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=0.8">
|
||||||
|
<meta name="description" content="A quest archive viewer.">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header>
|
||||||
|
<h1>Voyage</h1>
|
||||||
|
</header>
|
||||||
|
<main>
|
||||||
|
<ul>
|
||||||
|
{% for thread in threads %}
|
||||||
|
<li>{{ thread.title }}</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
</main>
|
||||||
|
</body>
|
||||||
|
</html>
|
46
voyage.py
Normal file
46
voyage.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
A display interface for archived 4chan quest threads.
|
||||||
|
"""
|
||||||
|
from aiohttp import web
|
||||||
|
import jinja2
|
||||||
|
import aiohttp_jinja2
|
||||||
|
from aiohttp_jinja2 import render_template
|
||||||
|
import asyncpg
|
||||||
|
import uvloop
|
||||||
|
|
||||||
|
import config
|
||||||
|
|
||||||
|
# Switch asyncio over to uvloop before any event loop is created
# (see the uvloop documentation for what install() changes).
uvloop.install()
|
||||||
|
# Route table the handlers below register on; added to the app in init_app().
routes = web.RouteTableDef()
|
||||||
|
|
||||||
|
@routes.get('/', name='index')
async def index(request):
    """
    The index page.

    Fetches every row of the `thread` table, oldest first, and renders
    `index.html` with them available as `threads`.
    """
    async with request.app['pool'].acquire() as conn:
        threads = await conn.fetch("SELECT * FROM thread ORDER BY time ASC")
    # Render after the connection has gone back to the pool, and hand the
    # template only what it uses instead of every local variable.
    return render_template("index.html", request, {'threads': threads})
|
||||||
|
|
||||||
|
|
||||||
|
async def init_app():
    """
    Initializes the application.

    Builds the real application (Jinja2 templates from `templates/`, an
    asyncpg connection pool stored under `app['pool']`, the module's
    routes) and returns a wrapper application that mounts it under
    `config.url_prefix`.
    """
    app = web.Application()
    aiohttp_jinja2.setup(
        app,
        trim_blocks=True,
        lstrip_blocks=True,
        undefined=jinja2.StrictUndefined,
        loader=jinja2.FileSystemLoader('templates'),
    )
    app['pool'] = await asyncpg.create_pool(**config.db)

    async def close_pool(app):
        """Closes the connection pool when the application shuts down."""
        await app['pool'].close()

    # Must be registered before add_subapp(), which freezes the sub-app.
    app.on_cleanup.append(close_pool)

    app.router.add_routes(routes)

    app_wrap = web.Application()
    app_wrap.add_subapp(config.url_prefix, app)
    return app_wrap
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Development entry point: the coroutine from init_app() is handed
    # straight to run_app, which serves on every interface at port 5450.
    web.run_app(init_app(), host='0.0.0.0', port=5450)
|
28
voyage.sql
Normal file
28
voyage.sql
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
-- One row per archived thread. scrape_quest.py inserts positionally
-- (id, title, time), so the column order here must not change.
CREATE TABLE IF NOT EXISTS thread(
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
-- the index page orders threads by this column
time TIMESTAMP WITH TIME ZONE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
-- One row per post. scrape_quest.py inserts all ten columns positionally,
-- so the column order here must not change.
CREATE TABLE IF NOT EXISTS post(
|
||||||
|
thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
-- NULL when the post has no tripcode element
trip_code TEXT,
|
||||||
|
-- NULL when the post has no subject element
subject TEXT,
|
||||||
|
time TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||||
|
-- the three file_* columns are NULL for posts without an attachment
file_url TEXT,
|
||||||
|
file_name TEXT,
|
||||||
|
file_md5 TEXT,
|
||||||
|
body TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Quote links between posts: link_from is the quoting post, link_to the
-- quoted one. No uniqueness constraint, so repeated quotes give repeated rows.
CREATE TABLE IF NOT EXISTS link(
|
||||||
|
link_from INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
|
||||||
|
link_to INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Heuristic tags attached to posts by scrape_quest.py ('qm_post',
-- 'story_post', 'dropped_trip', 'vote_tally_post', 'op_post').
-- No uniqueness constraint: a post can receive the same tag more than once.
CREATE TABLE IF NOT EXISTS tag(
|
||||||
|
post_id INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
|
||||||
|
name TEXT NOT NULL
|
||||||
|
);
|
Loading…
Reference in New Issue
Block a user