first commit
This commit is contained in:
commit
e64bdab04b
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
__pycache__/
|
||||
*.swp
|
||||
*.swo
|
||||
config.py
|
15
LICENSE
Normal file
15
LICENSE
Normal file
|
@ -0,0 +1,15 @@
|
|||
ISC License
|
||||
|
||||
Copyright (c) 2019, iou1name <iou1name@steelbea.me>
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
21
README.md
Normal file
21
README.md
Normal file
|
@ -0,0 +1,21 @@
|
|||
# Voyage
|
||||
Life is a journey.
|
||||
|
||||
## Requirements
|
||||
Python 3.7+
|
||||
PostgreSQL 11.5+
|
||||
Python packages: `gunicorn aiohttp aiohttp_jinja2 asyncpg uvloop beautifulsoup4 psycopg2`
|
||||
|
||||
## Install
|
||||
```
|
||||
$ psql
|
||||
postgres=# CREATE DATABASE "voyage";
|
||||
postgres=# CREATE USER "voyage" WITH PASSWORD 'password';
|
||||
postgres=# GRANT ALL PRIVILEGES ON DATABASE "voyage" TO "voyage";
|
||||
postgres=# \q
|
||||
```
|
||||
1. Get on the floor
|
||||
2. Walk the dinosaur
|
||||
|
||||
## Usage
|
||||
`gunicorn voyage:init_app --bind localhost:5450 --worker-class aiohttp.GunicornWebWorker`
|
17
config.py.template
Normal file
17
config.py.template
Normal file
|
@ -0,0 +1,17 @@
|
|||
#!/usr/bin/env python3
"""
Configuration settings for Voyage.
`server_domain` is the server's domain.
`url_prefix` is the root path you wish the app to reside at
eg. https://example.com/voyage
`db` specifies parameters for connecting to the PostgreSQL database.
"""
# NOTE(review): `server_domain` is described above but is not defined in this
# template; only `url_prefix` and `db` are set below.
url_prefix = '/voyage'

# Keyword arguments unpacked into the database connect/create_pool calls.
db = {
    'database': 'voyage',
    'user': 'voyage',
    'password': """password""",
    'host': 'localhost',
    'port': 5432,
}
|
132
scrape_quest.py
Normal file
132
scrape_quest.py
Normal file
|
@ -0,0 +1,132 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extracts post data from a quest archive.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import bs4
|
||||
import psycopg2
|
||||
|
||||
import config
|
||||
|
||||
# Base directory; the 'archive' and 'qstarchive' sub-directories beneath it
# are the ones scraped when this module is run as a script.
BQ_DIR = '/var/www/html/banished/'
# Poster names that cause a post to be tagged as a QM post.
QM_NAMES = ['Soma', 'Somas']

# Module-level connection and cursor, opened at import time from config.db
# and shared by every function in this module.
con = psycopg2.connect(**config.db)
cur = con.cursor()
|
||||
|
||||
def init_db():
    """Runs the statements in `voyage.sql` on the module-level cursor."""
    with open('voyage.sql', 'r') as schema_file:
        schema_sql = schema_file.read()
    cur.execute(schema_sql)
|
||||
|
||||
|
||||
def _utc_datetime(epoch_seconds):
    """Converts a Unix timestamp (in seconds) to a timezone-aware UTC datetime."""
    return datetime.datetime.fromtimestamp(
        int(epoch_seconds), tz=datetime.timezone.utc)


def _drop_tag(tags, tag):
    """Removes `tag` from the list `tags` if present; otherwise does nothing."""
    if tag in tags:
        tags.remove(tag)


def scrape_posts(root_dir):
    """Walks the `root_dir` and extracts data from index.html files found.

    For every index.html one row is inserted into `thread`, one row per post
    into `post`, one row per same-page quote link into `link` and one row per
    heuristic tag into `tag`, all through the module-level cursor `cur`.
    Nothing is committed here; that is left to the caller.
    """
    for dir_name, _sub_dirs, files in os.walk(root_dir):
        if 'index.html' not in files:
            continue
        # os.walk already yields `dir_name` prefixed with `root_dir`; joining
        # `root_dir` in again would double the prefix for relative roots.
        filepath = os.path.join(dir_name, 'index.html')
        print("Processing:", filepath)
        with open(filepath, 'r') as html_file:
            soup = bs4.BeautifulSoup(html_file.read(), 'html.parser')

        # drop every element with class 'mobile' before extracting anything
        for mobile_tag in soup.find_all(class_='mobile'):
            mobile_tag.decompose()

        # the thread element's id is one prefix character followed by digits
        thread_id = int(soup.find(class_='thread').get('id')[1:])
        thread_title = soup.find(class_='subject').text
        thread_time = _utc_datetime(
            soup.find(class_='dateTime').get('data-utc'))
        cur.execute(
            "INSERT INTO thread VALUES (%s,%s,%s)",
            (thread_id, thread_title, thread_time)
        )

        posts = soup.find_all(class_='postContainer')
        for position, post in enumerate(posts):
            # information gathering
            # the container id is two prefix characters followed by digits
            post_id = int(post.get('id')[2:])
            name = post.find(class_='name').text
            trip_code = post.find(class_='postertrip')
            if trip_code:
                trip_code = trip_code.text
            subject = post.find(class_='subject')
            if subject:
                subject = subject.text
            post_time = _utc_datetime(
                post.find(class_='dateTime').get('data-utc'))

            file_url = None
            file_name = None
            file_md5 = None
            file_text = post.find(class_='fileText')
            if file_text:
                file_url = file_text.a.get('href')
                file_name = file_text.a.text
                file_md5 = post.find(class_='fileThumb').img.get('data-md5')
            post_body = post.find(class_='postMessage').get_text('\n')

            # keep only quote links whose href is a same-page anchor ('#...');
            # the link text is two prefix characters followed by the post id
            quote_links = post.find_all(class_='quotelink')
            links = [
                int(anchor.text[2:]) for anchor in quote_links
                if anchor.get('href').startswith('#')
            ]

            # heuristics
            # NOTE(review): the original nesting of the checks below could
            # not be recovered from the flattened source; they are assumed to
            # apply to QM posts only -- confirm against the repository.
            tags = []
            if name in QM_NAMES:
                # also counts shitposters and broken tripcodes
                tags.append('qm_post')
                # assume every QM post is also a story post, until
                # proven otherwise
                tags.append('story_post')
                if "dropped my trip" in post_body.lower():
                    # retroactively tag the posts this one points at
                    for link in links:
                        cur.execute(
                            "INSERT INTO tag "
                            "VALUES (%s,%s), (%s,%s), (%s,%s)",
                            (link, 'qm_post',
                             link, 'story_post',
                             link, 'dropped_trip')
                        )
                    # dropped trip doesn't necessarily mean story_post
                    tags.append('dropped_trip')
                if len(links) > 1:
                    # also counts Q&A posts
                    tags.append('vote_tally_post')
                    _drop_tag(tags, 'story_post')
                if position == 0:
                    tags.append('op_post')
                if "Welcome to Banished Quest!" in post_body:
                    _drop_tag(tags, 'story_post')

            # database insert
            cur.execute(
                "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (thread_id, post_id, name, trip_code, subject,
                 post_time, file_url, file_name, file_md5, post_body)
            )
            for link in links:
                cur.execute(
                    "INSERT INTO link VALUES (%s,%s)",
                    (post_id, link)
                )
            for tag in tags:
                cur.execute(
                    "INSERT INTO tag VALUES (%s,%s)",
                    (post_id, tag)
                )
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # create the schema, scrape both archive trees, then persist everything
    init_db()
    for archive_name in ('archive', 'qstarchive'):
        scrape_posts(os.path.join(BQ_DIR, archive_name))
    con.commit()
    con.close()
|
0
static/voyage.css
Normal file
0
static/voyage.css
Normal file
0
static/voyage.js
Normal file
0
static/voyage.js
Normal file
22
templates/index.html
Normal file
22
templates/index.html
Normal file
|
@ -0,0 +1,22 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>Voyage</title>
|
||||
<link rel="stylesheet" type="text/css" href="/static/voyage.css">
|
||||
<script type="text/javascript" src="/static/voyage.js"></script>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=0.8">
|
||||
<meta name="description" content="A quest archive viewer.">
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>Voyage</h1>
|
||||
</header>
|
||||
<main>
|
||||
<ul>
|
||||
{% for thread in threads %}
|
||||
<li>{{ thread.title }}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
46
voyage.py
Normal file
46
voyage.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
A display interface for archived 4chan quest threads.
|
||||
"""
|
||||
from aiohttp import web
|
||||
import jinja2
|
||||
import aiohttp_jinja2
|
||||
from aiohttp_jinja2 import render_template
|
||||
import asyncpg
|
||||
import uvloop
|
||||
|
||||
import config
|
||||
|
||||
# Make uvloop the event loop implementation for this process.
uvloop.install()
# Route table that the handlers below register themselves on; it is added to
# the application's router in init_app().
routes = web.RouteTableDef()
|
||||
|
||||
@routes.get('/', name='index')
async def index(request):
    """The index page.

    Fetches every row of the `thread` table ordered by ascending `time` and
    renders `index.html` with the rows available as `threads`.
    """
    async with request.app['pool'].acquire() as conn:
        threads = await conn.fetch("SELECT * FROM thread ORDER BY time ASC")
    # Pass an explicit context rather than locals() so that `request` and the
    # already-released `conn` are not exposed to the template.
    return render_template("index.html", request, {'threads': threads})
|
||||
|
||||
|
||||
async def init_app():
    """Builds the application and mounts it under `config.url_prefix`.

    Returns the outer wrapper application that hosts the real one as a
    sub-application.
    """
    voyage_app = web.Application()
    voyage_app.router.add_routes(routes)

    jinja_options = {
        'trim_blocks': True,
        'lstrip_blocks': True,
        'undefined': jinja2.StrictUndefined,
        'loader': jinja2.FileSystemLoader('templates'),
    }
    aiohttp_jinja2.setup(voyage_app, **jinja_options)

    # connection pool used by the request handlers via request.app['pool']
    voyage_app['pool'] = await asyncpg.create_pool(**config.db)

    # the outer application only exists to host the real one at the prefix
    root_app = web.Application()
    root_app.add_subapp(config.url_prefix, voyage_app)
    return root_app
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # run_app is handed the un-awaited init_app() coroutine directly
    web.run_app(init_app(), host='0.0.0.0', port=5450)
|
28
voyage.sql
Normal file
28
voyage.sql
Normal file
|
@ -0,0 +1,28 @@
|
|||
-- One row per scraped thread.
CREATE TABLE IF NOT EXISTS thread(
    id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    time TIMESTAMP WITH TIME ZONE NOT NULL
);

-- One row per post; rows are removed together with their thread.
CREATE TABLE IF NOT EXISTS post(
    thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    trip_code TEXT,
    subject TEXT,
    time TIMESTAMP WITH TIME ZONE NOT NULL,
    file_url TEXT,
    file_name TEXT,
    file_md5 TEXT,
    body TEXT NOT NULL
);

-- Directed reference from one post to another; both ends must exist in post.
CREATE TABLE IF NOT EXISTS link(
    link_from INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
    link_to INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL
);

-- Named tag attached to a post; a post may have any number of rows here.
CREATE TABLE IF NOT EXISTS tag(
    post_id INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
    name TEXT NOT NULL
);
|
Loading…
Reference in New Issue
Block a user