#!/usr/bin/env python3
"""
|
|
|
|
Scrapes all images/webms from a 4chan thread.
|
|
|
|
TODO: Add support for more image boards.
|
|
|
|
"""

import os
import re
from urllib.parse import urlparse

import bs4
import requests

# Custom User-Agent sent with every request (presumably set to avoid
# default-python-requests-UA filtering -- NOTE(review): confirm it is needed).
HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def _dir_name(soup, thread_url):
    """Build the default "[board]-[thread no.]-[subject]" directory name."""
    parts = [urlparse(thread_url).path.split("/")[1]]  # board, e.g. "wg"
    parts.append(os.path.split(thread_url)[1])         # thread number
    # 4chan labels the subject element "subject"; foolfuuka-style archives
    # use "post_title".  Either may be absent, in which case it is blank.
    subject = soup.find(class_="subject") or soup.find(class_="post_title")
    parts.append(subject.text if subject else "")
    return "-".join(parts)


def _file_link(post):
    """Return (file url, original filename) for one file post.

    Handles both 4chan markup (a "fileText" element wrapping an <a>) and
    foolfuuka-archive markup (the post itself is the <a class="btnr parent">).
    Either element of the tuple may be None (e.g. for deleted files).
    """
    anchor = post.find('a')
    if anchor is not None:
        return anchor.get('href'), anchor.get('title')
    return post.get("href"), post.get("download")


def Scrape(thread_url, output=None, original_filename=False, combo=False):
    """Download every image/webm attached to a 4chan (or archive) thread.

    Args:
        thread_url: URL of the thread page; any "#anchor" fragment is
            stripped before the page is requested.
        output: Directory to save files into.  Defaults to
            "./[board]-[thread no.]-[subject]" under the current directory.
        original_filename: If True, save each file under the uploader's
            original filename (falls back to the server name when the
            original name is unavailable).
        combo: If True, save as "[server name]_[original name]".

    Raises:
        requests.HTTPError: If the thread page itself cannot be fetched.
    """
    # Strip any #anchor fragment so we request the plain thread page.
    thread_url = re.sub(r"/?#.*$", "", thread_url)

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:  # fixed: identity comparison, not "== None"
        output = os.path.join(os.getcwd(), _dir_name(soup, thread_url))

    print("Saving to: " + output)
    os.makedirs(output, exist_ok=True)
    existingFiles = os.listdir(output)

    # 4chan wraps uploads in "fileText" elements; foolfuuka archives expose
    # a bare <a class="btnr parent" download=...> per file instead.
    imgPosts = soup.find_all(class_="fileText")
    if not imgPosts:
        imgPosts = soup.find_all(
            "a", {"class": "btnr parent", "download": re.compile(r'.*')})

    for post in imgPosts:
        imgUrl, origName = _file_link(post)

        if not imgUrl:
            print("File Deleted")
            continue
        if imgUrl[0] == '/':
            # Protocol-relative link ("//i.4cdn.org/...").
            imgUrl = 'https:' + imgUrl

        unix = os.path.basename(imgUrl)  # server-assigned (timestamp) name
        if unix in existingFiles:
            print(unix + " already present.")
            continue

        print("Downloading URL:", imgUrl)
        # stream=True: write the file to disk in chunks instead of buffering
        # the whole body in memory (fixes eager download of large webms).
        res = s.get(imgUrl, verify=True, stream=True)
        if res.status_code == 404:
            print("404: Not Found")
            res.close()  # release the pooled connection of the unread body
            continue

        if original_filename and origName:
            fname = origName
            if os.path.exists(os.path.join(output, fname)):
                # Same original name already saved; disambiguate by
                # appending the unique server name.
                print("Filename collision")
                fname = os.path.splitext(fname)[0] + "_" + unix
        elif combo and origName:
            fname = os.path.splitext(unix)[0] + "_" + origName
        else:
            # Default naming; also the fallback when the post carries no
            # original filename (previously crashed with TypeError).
            fname = unix

        # save the image
        with open(os.path.join(output, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
description="Downloads all images from the specified 4chan thread.")
|
|
|
|
parser.add_argument(
|
|
|
|
"thread_url",
|
|
|
|
help="The url of the desired thread.")
|
|
|
|
parser.add_argument(
|
|
|
|
"-o",
|
|
|
|
"--output",
|
2018-03-21 23:23:26 -04:00
|
|
|
help="The directory to save images into. Default is " \
|
|
|
|
+ "./[board]-[thread no.]-[subject field].")
|
2018-03-04 21:52:35 -05:00
|
|
|
parser.add_argument(
|
|
|
|
"-f",
|
2018-03-21 23:23:26 -04:00
|
|
|
"--original-filename",
|
2018-03-04 21:52:35 -05:00
|
|
|
action="store_true",
|
|
|
|
help="Saves the files using the original filename.")
|
|
|
|
parser.add_argument(
|
|
|
|
"-c",
|
|
|
|
"--combo",
|
|
|
|
action="store_true",
|
2018-03-21 23:23:26 -04:00
|
|
|
help="Saves the files using a combination of the 4chan filename and " \
|
|
|
|
+ "the original filename.")
|
2018-03-04 21:52:35 -05:00
|
|
|
args = parser.parse_args()
|
|
|
|
|
2018-03-21 23:23:26 -04:00
|
|
|
Scrape(args.thread_url, args.output, args.original_filename, args.combo)