first commit

2019-12-06 18:13:43 -05:00 · 2019-12-06 18:13:43 -05:00 · e64bdab04b
commit e64bdab04b
10 changed files with 285 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+__pycache__/
+*.swp
+*.swo
+config.py
--- a/15
+++ b/15
@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2019, iou1name <iou1name@steelbea.me>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,21 @@
+# Voyage  
+Life is a journey.  
+
+## Requirements  
+Python 3.7+  
+PostgreSQL 11.5+  
+Python packages: `gunicorn aiohttp aiohttp_jinja2 asyncpg uvloop`  
+Additional Python packages for `scrape_quest.py`: `beautifulsoup4 psycopg2`  
+
+## Install  
+```
+$ psql
+postgres=# CREATE DATABASE "voyage";
+postgres=# CREATE USER "voyage" WITH PASSWORD 'password';
+postgres=# GRANT ALL PRIVILEGES ON DATABASE "voyage" TO "voyage";
+postgres=# \q
+```
+1. Get on the floor  
+2. Walk the dinosaur  
+
+## Usage  
+`gunicorn voyage:init_app --bind localhost:5450 --worker-class aiohttp.GunicornWebWorker`  
--- a/config.py.template
+++ b/config.py.template
@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+"""
+Configuration settings for Voyage.
+`url_prefix` is the root path you wish the app to reside at
+eg. https://example.com/voyage
+`db` specifies parameters for connecting to the PostgreSQL database.
+"""
+url_prefix = '/voyage'
+
+# passed as keyword arguments to psycopg2.connect() (scrape_quest.py) and
+# asyncpg.create_pool() (voyage.py)
+db = {
+	'database': 'voyage',
+	'user': 'voyage',
+	'password': """password""",
+	'host': 'localhost',
+	'port': 5432,
+}
--- a/scrape_quest.py
+++ b/scrape_quest.py
@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Extracts post data from a quest archive.
+"""
+import os
+import re
+import datetime
+
+import bs4
+import psycopg2
+
+import config
+
+# root of the quest archive; its 'archive' and 'qstarchive' subdirectories
+# are the ones scraped when this file is run as a script
+BQ_DIR = '/var/www/html/banished/'
+# poster names that get a post tagged as a QM post
+QM_NAMES = ['Soma', 'Somas']
+
+# connection and cursor shared by init_db() and scrape_posts(); opened at
+# import time with the settings from config.db
+con = psycopg2.connect(**config.db)
+cur = con.cursor()
+
+def init_db():
+	"""Initializes the database if it hasn't been already."""
+	with open('voyage.sql', 'r') as file:
+		cur.execute(file.read())
+
+
+def scrape_posts(root_dir):
+	"""Walks the `root_dir` and extracts data from index.html files found."""
+	for dir_name, sub_dirs, files in os.walk(root_dir):
+		for file in files:
+			if file != 'index.html':
+				continue
+			filepath = os.path.join(root_dir, dir_name, file)
+			print("Processing:", filepath)
+			with open(filepath, 'r') as file:
+				soup = bs4.BeautifulSoup(file.read(), 'html.parser')
+
+			mobiles = soup.find_all(class_='mobile')
+			for tag in mobiles:
+				tag.decompose()
+
+			thread_id = int(soup.find(class_='thread').get('id')[1:])
+			thread_title = soup.find(class_='subject').text
+			thread_time = int(soup.find(class_='dateTime').get('data-utc'))
+			thread_time = datetime.datetime.utcfromtimestamp(thread_time)
+			thread_time = thread_time.replace(tzinfo=datetime.timezone.utc)
+			cur.execute("INSERT INTO thread VALUES (%s,%s,%s)",
+				(thread_id, thread_title, thread_time)
+			)
+			posts = soup.find_all(class_='postContainer')
+			for post in posts:
+				# information gathering
+				post_id = int(post.get('id')[2:])
+				name = post.find(class_='name').text
+				trip_code = post.find(class_='postertrip')
+				if trip_code:
+					trip_code = trip_code.text
+				subject = post.find(class_='subject')
+				if subject:
+					subject = subject.text
+				post_time = int(post.find(class_='dateTime').get('data-utc'))
+				post_time = datetime.datetime.utcfromtimestamp(post_time)
+				post_time = post_time.replace(tzinfo=datetime.timezone.utc)
+				file_url = None
+				file_name = None
+				file_md5 = None
+				file_text = post.find(class_='fileText')
+				if file_text:
+					file_url = file_text.a.get('href')
+					file_name = file_text.a.text
+					file_md5 =post.find(class_='fileThumb').img.get('data-md5')
+				post_body = post.find(class_='postMessage').get_text('\n')
+
+				links = post.find_all(class_='quotelink')
+				links = [l for l in links if l.get('href').startswith('#')]
+				links = [int(link.text[2:]) for link in links]
+
+				# heuristics
+				tags = []
+				if name in QM_NAMES:
+					tags.append('qm_post')
+					# also counts shitposters and broken tripcodes
+					tags.append('story_post')
+					# assume every QM post is also a story post, until
+					# proven otherwise
+					if "dropped my trip" in post_body.lower():
+						for link in links:
+							cur.execute("INSERT INTO tag "
+								"VALUES (%s,%s), (%s,%s), (%s,%s)",
+								(link, 'qm_post',
+									link, 'story_post',
+									link, 'dropped_trip'
+								)
+							)
+							# dropped trip doesn't necessarily mean story_post
+						tags.append('dropped_trip')
+					if len(links) > 1:
+						tags.append('vote_tally_post')
+						# also counts Q&A posts
+						try:
+							tags.remove('story_post')
+						except ValueError:
+							pass
+					if posts.index(post) == 0:
+						tags.append('op_post')
+						if "Welcome to Banished Quest!" in post_body:
+							try:
+								tags.remove('story_post')
+							except ValueError:
+								pass
+
+				# database insert
+				cur.execute(
+					"INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
+					(thread_id, post_id, name, trip_code, subject,
+					post_time, file_url, file_name, file_md5, post_body)
+				)
+				for link in links:
+					cur.execute("INSERT INTO link VALUES (%s,%s)",
+						(post_id, link)
+					)
+				for tag in tags:
+					cur.execute("INSERT INTO tag VALUES (%s,%s)",
+						(post_id, tag)
+					)
+
+
+if __name__ == '__main__':
+	init_db()
+	scrape_posts(os.path.join(BQ_DIR, 'archive'))
+	scrape_posts(os.path.join(BQ_DIR, 'qstarchive'))
+	con.commit()
+	con.close()
--- a/static/voyage.css
+++ b/static/voyage.css
--- a/static/voyage.js
+++ b/static/voyage.js
--- a/templates/index.html
+++ b/templates/index.html
@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html lang="en">
+	<head>
+		<title>Voyage</title>
+		<link rel="stylesheet" type="text/css" href="/static/voyage.css">
+		<script type="text/javascript" src="/static/voyage.js"></script>
+		<meta name="viewport" content="width=device-width, initial-scale=0.8">
+		<meta name="description" content="A quest archive viewer.">
+	</head>
+	<body>
+		<header>
+			<h1>Voyage</h1>
+		</header>
+		<main>
+			<ul>
+				{% for thread in threads %}
+				<li>{{ thread.title }}</li>
+				{% endfor %}
+			</ul>
+		</main>
+	</body>
+</html>
--- a/voyage.py
+++ b/voyage.py
@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""
+A display interface for archived 4chan quest threads.
+"""
+from aiohttp import web
+import jinja2
+import aiohttp_jinja2
+from aiohttp_jinja2 import render_template
+import asyncpg
+import uvloop
+
+import config
+
+# make asyncio use uvloop's event loop; done at import time
+uvloop.install()
+# route table filled by the @routes decorators and registered in init_app()
+routes = web.RouteTableDef()
+
+@routes.get('/', name='index')
+async def index(request):
+	"""The index page."""
+	async with request.app['pool'].acquire() as conn:
+		threads = await conn.fetch("SELECT * FROM thread ORDER BY time ASC")
+	return render_template("index.html", request, locals())
+
+
+async def init_app():
+	"""Initializes the application."""
+	app = web.Application()
+	aiohttp_jinja2.setup(
+		app,
+		trim_blocks=True,
+		lstrip_blocks=True,
+		undefined=jinja2.StrictUndefined,
+		loader=jinja2.FileSystemLoader('templates'),
+	)
+	app['pool'] = await asyncpg.create_pool(**config.db)
+
+	app.router.add_routes(routes)
+
+	app_wrap = web.Application()
+	app_wrap.add_subapp(config.url_prefix, app)
+	return app_wrap
+
+
+if __name__ == "__main__":
+	app = init_app()
+	web.run_app(app, host='0.0.0.0', port=5450)
--- a/voyage.sql
+++ b/voyage.sql
@ -0,0 +1,28 @@
+-- One row per archived thread. scrape_quest.py inserts positionally, so the
+-- column order (id, title, time) must not change.
+CREATE TABLE IF NOT EXISTS thread(
+	id INTEGER PRIMARY KEY,
+	title TEXT NOT NULL,
+	time TIMESTAMP WITH TIME ZONE NOT NULL
+);
+
+-- One row per post in a thread; removed together with its thread.
+-- scrape_quest.py inserts positionally, so the column order must not change.
+-- trip_code, subject and the file_* columns are NULL when the post has none.
+CREATE TABLE IF NOT EXISTS post(
+	thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
+	id INTEGER PRIMARY KEY,
+	name TEXT NOT NULL,
+	trip_code TEXT,
+	subject TEXT,
+	time TIMESTAMP WITH TIME ZONE NOT NULL,
+	file_url TEXT,
+	file_name TEXT,
+	file_md5 TEXT,
+	body TEXT NOT NULL
+);
+
+-- Quote links between posts: link_from is the quoting post, link_to the
+-- quoted one. Both ends must exist in post; no uniqueness is enforced.
+CREATE TABLE IF NOT EXISTS link(
+	link_from INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
+	link_to INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL
+);
+
+-- Tags attached to posts by the scraper's heuristics (e.g. 'qm_post',
+-- 'story_post'). No uniqueness is enforced, so a post can carry the same
+-- tag more than once.
+CREATE TABLE IF NOT EXISTS tag(
+	post_id INTEGER REFERENCES post(id) ON DELETE CASCADE NOT NULL,
+	name TEXT NOT NULL
+);