From 8385103796e0bdb6281c4c8303cc2862b218252e Mon Sep 17 00:00:00 2001 From: iou1name Date: Mon, 20 Jan 2020 13:18:47 -0500 Subject: [PATCH] add email and id fields --- scrape_quest.py | 20 +++++++++++++------- voyage.sql | 6 ++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/scrape_quest.py b/scrape_quest.py index 0aaada1..b9e5308 100644 --- a/scrape_quest.py +++ b/scrape_quest.py @@ -52,13 +52,19 @@ def scrape_posts(root_dir): for post in posts: # information gathering post_id = int(post.get('id')[2:]) - name = post.find(class_='name').text - tripcode = post.find(class_='postertrip') - if tripcode: - tripcode = tripcode.text subject = post.find(class_='subject') if subject: subject = subject.text + name = post.find(class_='name').text + user_email = post.find(class_='useremail') + if user_email: + user_email = user_email.get('href') + tripcode = post.find(class_='postertrip') + if tripcode: + tripcode = tripcode.text + user_id = post.find(class_='hand') + if user_id: + user_id = user_id.text post_time = int(post.find(class_='dateTime').get('data-utc')) post_time = datetime.datetime.utcfromtimestamp(post_time) post_time = post_time.replace(tzinfo=datetime.timezone.utc) @@ -124,9 +130,9 @@ def scrape_posts(root_dir): # database insert cur.execute( - "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)", - (thread_id, post_id, name, tripcode, subject, - post_time, post_body) + "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)", + (thread_id, post_id, subject, name, user_email, tripcode, + user_id, post_time, post_body) ) for link in links: cur.execute("INSERT INTO link VALUES (%s,%s)", diff --git a/voyage.sql b/voyage.sql index e36ec8a..a08e960 100644 --- a/voyage.sql +++ b/voyage.sql @@ -7,9 +7,11 @@ CREATE TABLE IF NOT EXISTS thread ( CREATE TABLE IF NOT EXISTS post ( thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL, id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - tripcode TEXT, subject TEXT, + name TEXT NOT NULL, + email TEXT, + tripcode TEXT, + user_id TEXT, time TIMESTAMP WITH TIME ZONE NOT NULL, body TEXT NOT NULL );