From e64bdab04bfb1bc6a4ca745afa3e9bc95e374264 Mon Sep 17 00:00:00 2001 From: iou1name Date: Fri, 6 Dec 2019 18:13:43 -0500 Subject: [PATCH] first commit --- .gitignore | 4 ++ LICENSE | 15 +++++ README.md | 21 +++++++ config.py.template | 17 ++++++ scrape_quest.py | 132 +++++++++++++++++++++++++++++++++++++++++++ static/voyage.css | 0 static/voyage.js | 0 templates/index.html | 22 ++++++++ voyage.py | 46 +++++++++++++++ voyage.sql | 28 +++++++++ 10 files changed, 285 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 config.py.template create mode 100644 scrape_quest.py create mode 100644 static/voyage.css create mode 100644 static/voyage.js create mode 100644 templates/index.html create mode 100644 voyage.py create mode 100644 voyage.sql diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..610faed --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +*.swp +*.swo +config.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..410723e --- /dev/null +++ b/LICENSE @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2019, iou1name + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..41b7896
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+# Voyage
+Life is a journey.
+
+## Requirements
+Python 3.7+
+PostgreSQL 11.5+
+Python packages: `gunicorn aiohttp aiohttp_jinja2 asyncpg uvloop`
+
+## Install
+```
+$ psql
+postgres=# CREATE DATABASE "voyage";
+postgres=# CREATE USER "voyage" WITH PASSWORD 'password';
+postgres=# GRANT ALL PRIVILEGES ON DATABASE "voyage" TO "voyage";
+postgres=# \q
+```
+1. Get on the floor
+2. Walk the dinosaur
+
+## Usage
+`gunicorn voyage:init_app --bind localhost:5450 --worker-class aiohttp.GunicornWebWorker`
diff --git a/config.py.template b/config.py.template
new file mode 100644
index 0000000..e66f5f5
--- /dev/null
+++ b/config.py.template
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+"""
+Configuration settings for Voyage.
+`url_prefix` is the root path you wish the app to reside at
+eg. https://example.com/voyage
+`db` specifies parameters for connecting to the PostgreSQL database.
+"""
+url_prefix = '/voyage'
+
+db = {
+    'database': 'voyage',
+    'user': 'voyage',
+    'password': """password""",
+    'host': 'localhost',
+    'port': 5432,
+}
diff --git a/scrape_quest.py b/scrape_quest.py
new file mode 100644
index 0000000..8b74937
--- /dev/null
+++ b/scrape_quest.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""
+Extracts post data from a quest archive.
+"""
+import os
+import re
+import datetime
+
+import bs4
+import psycopg2
+
+import config
+
+BQ_DIR = '/var/www/html/banished/'
+QM_NAMES = ['Soma', 'Somas']
+
+con = psycopg2.connect(**config.db)
+cur = con.cursor()
+
+def init_db():
+    """
+    Initializes the database if it hasn't been already.
+
+    Executes the contents of `voyage.sql`, opened relative to the current
+    working directory, on the module-level cursor. Nothing is committed
+    here; the caller is responsible for committing.
+    """
+    with open('voyage.sql', 'r') as schema:
+        cur.execute(schema.read())
+
+
+def _utc_datetime(tag):
+    """Converts the `data-utc` attribute of `tag` to an aware datetime."""
+    timestamp = int(tag.get('data-utc'))
+    return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
+
+
+def _discard(tags, name):
+    """Removes `name` from the list `tags`; does nothing if it is absent."""
+    if name in tags:
+        tags.remove(name)
+
+
+def _scrape_post(thread_id, post, is_op):
+    """
+    Extracts a single post, tags it heuristically and inserts it.
+
+    `post` is the `postContainer` element of the post and `is_op` tells
+    whether it is the first post of the thread `thread_id`.
+    """
+    # information gathering
+    post_id = int(post.get('id')[2:])
+    name = post.find(class_='name').text
+    trip_code = post.find(class_='postertrip')
+    if trip_code:
+        trip_code = trip_code.text
+    subject = post.find(class_='subject')
+    if subject:
+        subject = subject.text
+    post_time = _utc_datetime(post.find(class_='dateTime'))
+    file_url = None
+    file_name = None
+    file_md5 = None
+    file_text = post.find(class_='fileText')
+    if file_text:
+        file_url = file_text.a.get('href')
+        file_name = file_text.a.text
+        file_md5 = post.find(class_='fileThumb').img.get('data-md5')
+    post_body = post.find(class_='postMessage').get_text('\n')
+
+    # keep only quote links whose href starts with '#'; the target post id
+    # is the link text minus its first two characters
+    links = [
+        int(link.text[2:])
+        for link in post.find_all(class_='quotelink')
+        if link.get('href', '').startswith('#')
+    ]
+
+    # heuristics
+    tags = []
+    if name in QM_NAMES:
+        # also counts shitposters and broken tripcodes
+        tags.append('qm_post')
+        # assume every QM post is also a story post, until
+        # proven otherwise
+        tags.append('story_post')
+    if "dropped my trip" in post_body.lower():
+        # tag every quoted post as qm_post, story_post and dropped_trip
+        for link in links:
+            cur.execute(
+                "INSERT INTO tag "
+                "VALUES (%s,%s), (%s,%s), (%s,%s)",
+                (link, 'qm_post',
+                 link, 'story_post',
+                 link, 'dropped_trip')
+            )
+        # dropped trip doesn't necessarily mean story_post
+        tags.append('dropped_trip')
+    if len(links) > 1:
+        # also counts Q&A posts
+        tags.append('vote_tally_post')
+        _discard(tags, 'story_post')
+    if is_op:
+        tags.append('op_post')
+    # NOTE(review): the flattened original does not show whether this check
+    # was nested under the OP check above -- confirm
+    if "Welcome to Banished Quest!" in post_body:
+        _discard(tags, 'story_post')
+
+    # database insert
+    cur.execute(
+        "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
+        (thread_id, post_id, name, trip_code, subject,
+         post_time, file_url, file_name, file_md5, post_body)
+    )
+    for link in links:
+        cur.execute("INSERT INTO link VALUES (%s,%s)", (post_id, link))
+    for tag in tags:
+        cur.execute("INSERT INTO tag VALUES (%s,%s)", (post_id, tag))
+
+
+def _scrape_thread(filepath):
+    """Parses one archived thread page and inserts it with its posts."""
+    # NOTE(review): the archive pages are assumed to be UTF-8 -- confirm
+    with open(filepath, 'r', encoding='utf-8') as page:
+        soup = bs4.BeautifulSoup(page.read(), 'html.parser')
+
+    # remove every element of class `mobile` before extracting anything
+    for tag in soup.find_all(class_='mobile'):
+        tag.decompose()
+
+    thread_id = int(soup.find(class_='thread').get('id')[1:])
+    thread_title = soup.find(class_='subject').text
+    thread_time = _utc_datetime(soup.find(class_='dateTime'))
+    cur.execute(
+        "INSERT INTO thread VALUES (%s,%s,%s)",
+        (thread_id, thread_title, thread_time)
+    )
+    posts = soup.find_all(class_='postContainer')
+    for index, post in enumerate(posts):
+        # the first post container of the page is the opening post
+        _scrape_post(thread_id, post, index == 0)
+
+
+def scrape_posts(root_dir):
+    """
+    Walks the `root_dir` and extracts data from index.html files found.
+
+    Each `index.html` below `root_dir` is parsed as one archived thread.
+    The thread, its posts, their quote links and their heuristic tags are
+    inserted through the module-level cursor; nothing is committed here.
+    """
+    for dir_name, _sub_dirs, files in os.walk(root_dir):
+        if 'index.html' not in files:
+            continue
+        # os.walk() already yields `dir_name` prefixed with `root_dir`, so
+        # joining `root_dir` in again would double a relative root
+        filepath = os.path.join(dir_name, 'index.html')
+        print("Processing:", filepath)
+        _scrape_thread(filepath)
+
+
+if __name__ == '__main__':
+    try:
+        init_db()
+        scrape_posts(os.path.join(BQ_DIR, 'archive'))
+        scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
+        con.commit()
+    finally:
+        # psycopg2 discards uncommitted work on close, so a failed run
+        # leaves no partial import behind
+        con.close()
diff --git a/static/voyage.css b/static/voyage.css
new file mode 100644
index 0000000..e69de29
diff --git a/static/voyage.js b/static/voyage.js
new file mode 100644
index 0000000..e69de29
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000..4721157
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,22 @@
+ + + + Voyage + + + + + + +
+

Voyage

+
+
+
    + {% for thread in threads %} +
  • {{ thread.title }}
  • + {% endfor %} +
+
+ +
diff --git a/voyage.py b/voyage.py
new file mode 100644
index 0000000..d63ff5d
--- /dev/null
+++ b/voyage.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""
+A display interface for archived 4chan quest threads.
+"""
+from aiohttp import web
+import jinja2
+import aiohttp_jinja2
+from aiohttp_jinja2 import render_template
+import asyncpg
+import uvloop
+
+import config
+
+uvloop.install()
+routes = web.RouteTableDef()
+
+@routes.get('/', name='index')
+async def index(request):
+    """
+    The index page.
+
+    Lists every row of the `thread` table, oldest first.
+    """
+    async with request.app['pool'].acquire() as conn:
+        threads = await conn.fetch("SELECT * FROM thread ORDER BY time ASC")
+    # the connection goes back to the pool before rendering; only the
+    # names the template needs are passed instead of all of locals()
+    context = {'request': request, 'threads': threads}
+    return render_template("index.html", request, context)
+
+
+async def _close_pool(app):
+    """Closes the database connection pool when the application stops."""
+    await app['pool'].close()
+
+
+async def init_app():
+    """
+    Initializes the application.
+
+    Sets up the Jinja2 environment, the database pool and the routes, then
+    mounts the application under `config.url_prefix` and returns the
+    wrapping application.
+    """
+    app = web.Application()
+    aiohttp_jinja2.setup(
+        app,
+        trim_blocks=True,
+        lstrip_blocks=True,
+        undefined=jinja2.StrictUndefined,
+        loader=jinja2.FileSystemLoader('templates'),
+    )
+    app['pool'] = await asyncpg.create_pool(**config.db)
+    # release the pooled connections on shutdown
+    app.on_cleanup.append(_close_pool)
+
+    app.router.add_routes(routes)
+
+    app_wrap = web.Application()
+    app_wrap.add_subapp(config.url_prefix, app)
+    return app_wrap
+
+
+if __name__ == "__main__":
+    # run_app() also accepts a coroutine that returns the application
+    app = init_app()
+    web.run_app(app, host='0.0.0.0', port=5450)
diff --git a/voyage.sql b/voyage.sql
new file mode 100644
index 0000000..2818cdc
--- /dev/null
+++ b/voyage.sql
@@ -0,0 +1,32 @@
+-- one row per archived thread
+CREATE TABLE IF NOT EXISTS thread(
+    id INTEGER PRIMARY KEY,
+    title TEXT NOT NULL,
+    time TIMESTAMP WITH TIME ZONE NOT NULL
+);
+
+-- one row per post of a thread
+CREATE TABLE IF NOT EXISTS post(
+    thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
+    id INTEGER PRIMARY KEY,
+    name TEXT NOT NULL,
+    trip_code TEXT,
+    subject TEXT,
+    time TIMESTAMP WITH TIME ZONE NOT NULL,
+    file_url TEXT,
+    file_name TEXT,
+    file_md5 TEXT,
+    body TEXT NOT NULL
+);
+
+-- quote link from one post to another
+CREATE TABLE IF NOT EXISTS link(
+    link_from INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
+    link_to INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL
+);
+
+-- tag attached to a post
+CREATE TABLE IF NOT EXISTS tag(
+    post_id INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
+    name TEXT NOT NULL
+);