4chanScraper/4chanScrape.py

#!/usr/bin/env python3
"""
Scrapes all images/webms from a 4chan thread.
"""
import os
import re
from urllib.parse import urlparse

import bs4
import requests

# Extra HTTP headers applied to the requests session that Scrape() creates,
# so every request it makes carries this User-Agent.
HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def _safe_name(name):
	"""
	Returns name with path separators and control characters replaced by "_".

	Names taken from the downloaded page are untrusted; without this a "/"
	in them would address a different directory than the intended one.
	"""
	return re.sub(r'[\\/\x00-\x1f]', "_", name)


def Scrape(thread_url, output=None, original_filename=False, combo=False):
	"""
	Downloads thread page, extracts file urls and saves them to a directory.

	thread_url -- address of the thread. A "#..." post anchor and trailing
		forward slashes are ignored.
	output -- directory to save into (created if missing). When None, a
		directory named "[board]-[thread no.]-[subject]" is used inside the
		current working directory.
	original_filename -- save each file under the filename shown on the page
		instead of the name taken from its url. If a file with that name
		already exists, the url name is appended. Takes precedence over
		combo.
	combo -- save each file as "[url name without extension]_[page name]".

	An HTTP error status on the thread page is raised by raise_for_status().
	A file whose download returns an error status is reported and skipped.
	"""
	# Drop the post anchor first, then trailing forward slashes, so that
	# ".../123#p456" and ".../123/#p456" both reduce to ".../123".
	thread_url = thread_url.split("#", 1)[0].rstrip("/")

	# The context manager closes the session's connections when done.
	with requests.Session() as s:
		s.headers.update(HEADERS)

		print('Downloading page: ' + thread_url)
		res = s.get(thread_url, verify=True)
		res.raise_for_status()
		soup = bs4.BeautifulSoup(res.text, "html.parser")

		if output is None:
			# Only the url path is used, so a query string cannot end up
			# in the directory name.
			segments = [
				part for part in urlparse(thread_url).path.split("/") if part]
			board = segments[0] if segments else ""
			thread_no = segments[-1] if segments else ""

			# Two class names are tried in turn; the first one present
			# provides the subject text.
			subject = ""
			for css_class in ("subject", "post_title"):
				tag = soup.find(class_=css_class)
				if tag is not None:
					subject = tag.text
					break

			dirName = "-".join([board, thread_no, _safe_name(subject)])
			output = os.path.join(os.getcwd(), dirName)

		print("Saving to: " + output)
		os.makedirs(output, exist_ok=True)
		# A set keeps the per-file "already downloaded" check cheap.
		existingFiles = set(os.listdir(output))

		imgPosts = soup.find_all(class_ = "fileText")
		if not imgPosts:
			imgPosts = soup.find_all("a",
				{"class": "btnr parent", "download": re.compile(r'.*')})

		for post in imgPosts:
			# A "fileText" element wraps the file link; in the fallback
			# markup the matched element is the link itself.
			link = post.find('a')
			if link is None:
				imgUrl = post.get("href")
				fname = post.get("download")
			else:
				imgUrl = link.get('href')
				# The link's title attribute can be absent, and some posts
				# appear to carry the name on the wrapper instead, so fall
				# back to that and finally to the visible link text.
				fname = (link.get('title') or post.get('title')
					or link.get_text())

			if not imgUrl:
				print("File Deleted")
				continue
			if imgUrl.startswith('/'):
				# scheme-less url such as "//host/path"
				imgUrl = 'https:' + imgUrl

			unix = os.path.basename(imgUrl)
			if unix in existingFiles:
				print(unix + " already present.")
				continue
			print("Downloading URL:", imgUrl)
			res = s.get(imgUrl, verify=True)

			if res.status_code == 404:
				print("404: Not Found")
				continue
			if not res.ok:
				# Do not save an error page as if it were the file.
				print(str(res.status_code) + ": download failed")
				continue

			fname = _safe_name(fname) if fname else ""
			if fname in ("", ".", ".."):
				# The page gave no usable name; use the one from the url.
				fname = unix
			elif original_filename:
				if os.path.exists(os.path.join(output, fname)):
					print("Filename collision")
					fname = os.path.splitext(fname)[0] + "_" + unix
			elif combo:
				fname = os.path.splitext(unix)[0] + "_" + fname
			else:
				fname = unix

			# save the image
			with open(os.path.join(output, fname), 'wb') as imgFile:
				for chunk in res.iter_content(100000):
					imgFile.write(chunk)


if __name__ == "__main__":
	import argparse

	# Command-line interface: the thread url, an optional target directory
	# and two on/off switches that select how saved files are named.
	cli = argparse.ArgumentParser(
		description="Downloads all images from the specified 4chan thread.")
	cli.add_argument("thread_url", help="The url of the desired thread.")
	cli.add_argument(
		"-o", "--output",
		help=(
			"The directory to save images into. Default is "
			"./[board]-[thread no.]-[subject field]."))

	# (short flag, long flag, help text) for each boolean naming switch.
	naming_flags = (
		("-f", "--original-filename",
			"Saves the files using the original filename."),
		("-c", "--combo",
			"Saves the files using a combination of the 4chan filename and "
			"the original filename."),
	)
	for short_flag, long_flag, help_text in naming_flags:
		cli.add_argument(
			short_flag, long_flag, action="store_true", help=help_text)

	opts = cli.parse_args()
	Scrape(
		opts.thread_url,
		output=opts.output,
		original_filename=opts.original_filename,
		combo=opts.combo)