#!/usr/bin/env python3
"""Scrapes all images/webms from a 4chan thread."""
import os
import re
from urllib.parse import urlparse

import bs4
import requests

HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def Scrape(thread_url, output=None, original_filename=False, combo=False):
    """Download the thread page, extract file URLs and save them to a directory."""
    # Strip a trailing slash and any post-number anchor (e.g. "#p12345") from the URL.
    thread_url = re.sub(r"/(?:#.+)?$", "", thread_url)

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:
        output = os.getcwd()

    # Build the directory name as [board]-[thread no.]-[subject field, if any].
    dirName = [urlparse(thread_url).path.split("/")[1]]
    dirName += [os.path.split(thread_url)[1]]
    subject = soup.find(class_="subject")
    if not subject:
        subject = soup.find(class_="post_title")  # archive sites use this class
    if subject:
        dirName += [subject.text]
    dirName = "-".join(dirName)

    output = os.path.join(output, dirName)
    print("Saving to: " + output)
    os.makedirs(output, exist_ok=True)
    existingFiles = os.listdir(output)

    # 4chan wraps file info in "fileText" elements; archive sites instead
    # expose a download link ("btnr parent") carrying a "download" attribute.
    imgPosts = soup.find_all(class_="fileText")
    if not imgPosts:
        imgPosts = soup.find_all(
            "a", {"class": "btnr parent", "download": re.compile(r'.*')})

    for post in imgPosts:
        try:
            imgUrl = post.find('a').get('href')
        except AttributeError:
            imgUrl = post.get("href")
        if not imgUrl:
            print("File Deleted")
            continue
        if imgUrl.startswith('//'):  # protocol-relative URL
            imgUrl = 'https:' + imgUrl
        if os.path.basename(imgUrl) in existingFiles:
            print(os.path.basename(imgUrl) + " already present.")
            continue

        print("Downloading URL:", imgUrl)
        res = s.get(imgUrl, verify=True, stream=True)
        if res.status_code == 404:
            print("404: Not Found")
            continue

        # 4chan renames uploads to a unix timestamp; the uploader's original
        # filename lives in the link's title attribute (or the link text when
        # the name is short enough not to be truncated).
        unix = os.path.basename(imgUrl)
        try:
            link = post.find('a')
            fname = link.get('title') or link.text
        except AttributeError:
            fname = post.get("download")

        if original_filename:
            if os.path.exists(os.path.join(output, fname)):
                print("Filename collision")
                fname = os.path.splitext(fname)[0]
                fname += "_" + unix
        elif combo:
            fname = os.path.splitext(unix)[0] + "_" + fname
        else:
            fname = unix

        # Save the image in chunks to avoid holding the whole file in memory.
        with open(os.path.join(output, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloads all images from the specified 4chan thread.")
    parser.add_argument(
        "thread_url",
        help="The url of the desired thread.")
    parser.add_argument(
        "-o", "--output",
        help="The directory to save images into. Default is "
             "./[board]-[thread no.]-[subject field].")
    parser.add_argument(
        "-f", "--original-filename", action="store_true",
        help="Saves the files using the original filename.")
    parser.add_argument(
        "-c", "--combo", action="store_true",
        help="Saves the files using a combination of the 4chan filename and "
             "the original filename.")
    args = parser.parse_args()
    Scrape(args.thread_url, args.output, args.original_filename, args.combo)
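
# Example usage sketches (the thread URLs below are hypothetical, and the
# module import assumes this file is saved as scrape.py):
#
#   From the command line, keeping the uploaders' original filenames:
#       python3 scrape.py https://boards.4chan.org/wg/thread/1234567 -o ~/pics -f
#
#   Or programmatically, combining the 4chan timestamp with the original name:
#       from scrape import Scrape
#       Scrape("https://boards.4chan.org/g/thread/7654321", combo=True)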