Add email and user ID fields to scraped posts

This commit is contained in:
iou1name 2020-01-20 13:18:47 -05:00
parent 2c6adac225
commit 8385103796
2 changed files with 17 additions and 9 deletions

View File

@@ -52,13 +52,19 @@ def scrape_posts(root_dir):
for post in posts: for post in posts:
# information gathering # information gathering
post_id = int(post.get('id')[2:]) post_id = int(post.get('id')[2:])
name = post.find(class_='name').text
tripcode = post.find(class_='postertrip')
if tripcode:
tripcode = tripcode.text
subject = post.find(class_='subject') subject = post.find(class_='subject')
if subject: if subject:
subject = subject.text subject = subject.text
name = post.find(class_='name').text
user_email = post.find(class_='useremail')
if user_email:
user_email = user_email.get('href')
tripcode = post.find(class_='postertrip')
if tripcode:
tripcode = tripcode.text
user_id = post.find(class_='hand')
if user_id:
user_id = user_id.text
post_time = int(post.find(class_='dateTime').get('data-utc')) post_time = int(post.find(class_='dateTime').get('data-utc'))
post_time = datetime.datetime.utcfromtimestamp(post_time) post_time = datetime.datetime.utcfromtimestamp(post_time)
post_time = post_time.replace(tzinfo=datetime.timezone.utc) post_time = post_time.replace(tzinfo=datetime.timezone.utc)
@@ -124,9 +130,9 @@ def scrape_posts(root_dir):
# database insert # database insert
cur.execute( cur.execute(
"INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s)", "INSERT INTO post VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(thread_id, post_id, name, tripcode, subject, (thread_id, post_id, subject, name, user_email, tripcode,
post_time, post_body) user_id, post_time, post_body)
) )
for link in links: for link in links:
cur.execute("INSERT INTO link VALUES (%s,%s)", cur.execute("INSERT INTO link VALUES (%s,%s)",

View File

@@ -7,9 +7,11 @@ CREATE TABLE IF NOT EXISTS thread (
CREATE TABLE IF NOT EXISTS post ( CREATE TABLE IF NOT EXISTS post (
thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL, thread_id INTEGER REFERENCES thread(id) ON DELETE CASCADE NOT NULL,
id INTEGER PRIMARY KEY, id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
tripcode TEXT,
subject TEXT, subject TEXT,
name TEXT NOT NULL,
email TEXT,
tripcode TEXT,
user_id TEXT,
time TIMESTAMP WITH TIME ZONE NOT NULL, time TIMESTAMP WITH TIME ZONE NOT NULL,
body TEXT NOT NULL body TEXT NOT NULL
); );