#!/usr/bin/env python3
"""
Scrapes all images/webms from a 4chan, fireden or 4plebs thread.
TODO: Add support for more image boards.
"""
import os
import re
from urllib.parse import urlparse

import requests
import bs4

# Some hosts reject requests' default User-Agent, so send a browser-like one.
HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; thread-scraper)'}


def defaultOutput(thread_url, soup):
    """ Builds a suitable directory name to save files to, from the thread
    url and the subject field (if present).
    Format: [board]-[thread no.]-[subject field] """
    # TODO: remove the need for thread_url to be passed
    # Board name is the first path component, thread number the last.
    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
    subject = soup.find(class_='subject')
    if subject:
        output = output + '-' + subject.text
    # Strip characters that don't belong in a directory name.
    output = output.replace("/", "")
    output = output.replace("...", "")
    return output


def extract4chanFiles(soup):
    """ Extracts all file urls from the provided html-soup object and
    returns a list of (url, filename) tuples. 4chan only. """
    imgPosts = soup.find_all(class_="fileText")
    urls = []
    for post in imgPosts:
        url = post.find('a').get('href')
        # Long filenames are truncated in the link text but kept in 'title'.
        fname = post.find('a').get('title')
        if not fname:
            fname = post.find('a').text
        urls.append((url, fname))
    return urls


def extractFoolFuukaFiles(soup):
    """ Extracts all file urls from the provided html-soup object and
    returns a list of (url, filename) tuples. FoolFuuka archives only. """
    # File links are the anchors that carry a 'download' attribute.
    imgPosts = soup.find_all("a", {"class": "btnr parent",
                                   "download": re.compile(r'.*')})
    urls = []
    for post in imgPosts:
        url = post.get("href")
        if not urlparse(url).scheme:  # protocol-relative link
            url = "http:" + url
        fname = post.get("download")
        urls.append((url, fname))
    return urls


def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False,
           combo=False):
    """ Downloads the thread page, extracts file urls and saves the files
    to a directory. """
    boards = {"boards.4chan.org": extract4chanFiles,
              "boards.fireden.net": extractFoolFuukaFiles,
              "archive.4plebs.org": extractFoolFuukaFiles}
    domain = urlparse(thread_url).netloc
    if domain not in boards:
        print("Unknown URL. Exiting.")
        return

    s = requests.Session()
    s.headers.update(HEADERS)
    print('Downloading page: ' + thread_url)
    res = s.get(thread_url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:
        output = defaultOutput(thread_url, soup)
    directory = os.path.join(path, output)
    os.makedirs(directory, exist_ok=True)
    existingFiles = os.listdir(directory)

    urls = boards[domain](soup)
    if not urls:
        print("No files found. Exiting.")
        return

    print("Saving to: " + output)
    for imgUrl, fname in urls:
        if not imgUrl:
            print("File Deleted")
            continue
        if imgUrl.startswith('/'):  # protocol-relative link
            imgUrl = 'https:' + imgUrl
        # Note: dedupe is by the site's filename, so re-runs with -f or -c
        # may download files again.
        if os.path.basename(imgUrl) in existingFiles:
            print(os.path.basename(imgUrl) + " already present.")
            continue

        print("Downloading URL:", imgUrl)
        # stream=True so large files are written in chunks, not held in RAM.
        res = s.get(imgUrl, stream=True)
        if not res.ok:
            print(str(res.status_code) + ": " + res.reason)
            continue

        unix = os.path.basename(imgUrl)  # site filename, a unix timestamp
        if original_fname:
            if os.path.exists(os.path.join(directory, fname)):
                print("Filename collision")
                fname = os.path.splitext(fname)[0] + "_" + unix
        elif combo:
            fname = os.path.splitext(unix)[0] + "_" + fname
        else:
            fname = unix
        # save the image
        with open(os.path.join(directory, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)
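# Toward the TODO above: any FoolFuuka-based archive should work with the
# existing extractor by mapping its domain in the 'boards' dict inside
# Scrape(). The domain below is an example and is assumed, not verified
# here, to run FoolFuuka:
#
#     boards["desuarchive.org"] = extractFoolFuukaFiles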
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloads all images from the specified 4chan thread.")
    parser.add_argument(
        "thread_url",
        help="The url of the desired thread.")
    parser.add_argument(
        "-o", "--output",
        default=None,
        help="Title of the directory to save images into. Default is \
[board]-[thread no.]-[subject field].")
    parser.add_argument(
        "-p", "--path",
        default=os.getcwd(),
        help="Full path to the base directory to save the gallery into. \
Default is the current directory.")
    parser.add_argument(
        "-f", "--original-filename",
        dest="original_fname",
        action="store_true",
        default=False,
        help="Saves the files using the original filename.")
    parser.add_argument(
        "-c", "--combo",
        action="store_true",
        help="Saves the files using a combination of the 4chan filename and \
the original filename.")
    args = parser.parse_args()
    Scrape(**vars(args))
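# Example invocations (a sketch; the script name scrape.py and the thread
# URLs/numbers below are hypothetical placeholders):
#
#     python3 scrape.py https://boards.4chan.org/wg/thread/12345678
#     python3 scrape.py https://archive.4plebs.org/hr/thread/12345678 -c
#     python3 scrape.py https://boards.4chan.org/wg/thread/12345678 \
#         -o wallpapers -p ~/Pictures -f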