#!/usr/bin/env python3
"""
Scrapes all images/webms from a 4chan thread.
TODO: Add support for more image boards.
"""
import os
import re
from urllib.parse import urlparse

import requests
import bs4

HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def defaultOutput(thread_url, soup):
    """
    Build a directory name to save files to, from the thread URL and the
    subject field (if present).

    Format: [board]-[thread no.]-[subject field]
    """
    # TODO: remove the need for thread_url to be passed
    # URL shape: https://<host>/<board>/thread/<no.> -> split index 3 is the
    # board name; the final path component is the thread number.
    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
    subject = soup.find(class_='subject')  # look up once, not twice
    if subject:
        output = output + '-' + subject.text
    # Strip characters that are unsafe or awkward in directory names.
    output = output.replace("/", "")
    output = output.replace("...", "")
    return output


def extract4chanFiles(soup):
    """
    Extract all file URLs from the provided html-soup object and return a
    list of (url, filename) tuples. 4chan only.
    """
    urls = []
    for post in soup.find_all(class_="fileText"):
        link = post.find('a')
        url = link.get('href')
        # The 'title' attribute carries the full original filename when the
        # visible link text was truncated; fall back to the link text.
        fname = link.get('title')
        if not fname:
            fname = link.text
        urls.append((url, fname))
    return urls


def extractFoolFuukaFiles(soup):
    """
    Extract all file URLs from the provided html-soup object and return a
    list of (url, filename) tuples. FoolFuuka archives only (fireden,
    4plebs, ...).
    """
    imgPosts = soup.find_all("a", {"class": "btnr parent",
                                   "download": re.compile(r'.*')})
    urls = []
    for post in imgPosts:
        url = post.get("href")
        if not urlparse(url).scheme:
            # Protocol-relative link ("//host/...") — make it absolute.
            url = "http:" + url
        fname = post.get("download")
        urls.append((url, fname))
    return urls


def Scrape(thread_url, output=None, path=None, original_fname=False,
           combo=False):
    """
    Download the thread page, extract file URLs and save the files to a
    directory.

    :param thread_url: URL of the thread to scrape.
    :param output: directory name to save into; defaults to the name built
        by defaultOutput().
    :param path: base directory the output directory is created under;
        defaults to the current working directory at call time.
    :param original_fname: save files under their original upload names.
    :param combo: save files as "<4chan name>_<original name>".
    """
    boards = {"boards.4chan.org": extract4chanFiles,
              "boards.fireden.net": extractFoolFuukaFiles,
              "archive.4plebs.org": extractFoolFuukaFiles}
    domain = urlparse(thread_url).netloc
    if domain not in boards:
        print("Unknown URL. Exiting.")
        return

    if path is None:
        # Resolved here (not in the signature) so the default tracks the
        # current working directory at call time, not at import time.
        path = os.getcwd()

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:
        output = defaultOutput(thread_url, soup)
    directory = os.path.join(path, output)
    os.makedirs(directory, exist_ok=True)
    existingFiles = os.listdir(directory)

    urls = boards[domain](soup)
    if urls is None:
        print("Unknown URL. Exiting.")
        return

    print("Saving to: " + output)
    for imgUrl, fname in urls:
        if not imgUrl:
            # The post's file was deleted; the href is empty.
            print("File Deleted")
            continue
        if imgUrl.startswith('/'):
            # Protocol-relative 4chan CDN link.
            imgUrl = 'https:' + imgUrl

        if os.path.basename(imgUrl) in existingFiles:
            print(os.path.basename(imgUrl) + " already present.")
            continue
        print("Downloading URL:", imgUrl)
        # stream=True so large webms are written chunk-by-chunk instead of
        # being buffered whole in memory.
        res = s.get(imgUrl, verify=True, stream=True)

        if res.status_code == 404:
            print("404: Not Found")
            continue

        # "unix" name: the server-assigned (timestamp-based) filename.
        unix = os.path.basename(imgUrl)
        if original_fname:
            if os.path.exists(os.path.join(directory, fname)):
                # Two uploads shared the same original name; disambiguate
                # with the server-assigned name.
                print("Filename collision")
                fname = os.path.splitext(fname)[0]
                fname += "_" + unix
        elif combo:
            fname = os.path.splitext(unix)[0] + "_" + fname
        else:
            fname = unix
        # save the image
        with open(os.path.join(directory, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Downloads all images from the specified 4chan thread.")
    parser.add_argument(
        "thread_url",
        help="The url of the desired thread.")
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        help="Title of directory to save images into. Default is \
        [board]-[thread no.]-[subject field].")
    parser.add_argument(
        "-p",
        "--path",
        default=os.getcwd(),
        help="Full path to the base directory to save gallery into. Default \
        is current directory.")
    parser.add_argument(
        "-f",
        "--original-filename",
        "--orginal-filename",  # historical misspelling kept for compatibility
        dest="original_fname",
        action="store_true",
        default=False,
        help="Saves the files using the original filename.")
    parser.add_argument(
        "-c",
        "--combo",
        action="store_true",
        help="Saves the files using a combination of the 4chan filename and \
        the original filename.")
    args = parser.parse_args()

    Scrape(**vars(args))