#!/usr/bin/env python3
"""
|
|
|
|
Scrapes all images/webms from a 4chan thread.
|
|
|
|
TODO: Add support for more image boards.
|
|
|
|
"""

import os
import re
from urllib.parse import urlparse

import bs4
import requests

# Custom User-Agent sent with every request (presumably set to avoid
# default-python-requests-UA filtering -- NOTE(review): confirm it is needed).
HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def _dir_name(soup, thread_url):
    """Build the default "[board]-[thread no.]-[subject]" directory name."""
    parts = [urlparse(thread_url).path.split("/")[1]]  # board, e.g. "wg"
    parts.append(os.path.split(thread_url)[1])         # thread number
    # 4chan labels the subject element "subject"; foolfuuka-style archives
    # use "post_title".  Either may be absent, in which case it is blank.
    subject = soup.find(class_="subject") or soup.find(class_="post_title")
    parts.append(subject.text if subject else "")
    return "-".join(parts)


def _file_link(post):
    """Return (file url, original filename) for one file post.

    Handles both 4chan markup (a "fileText" element wrapping an <a>) and
    foolfuuka-archive markup (the post itself is the <a class="btnr parent">).
    Either element of the tuple may be None (e.g. for deleted files).
    """
    anchor = post.find('a')
    if anchor is not None:
        return anchor.get('href'), anchor.get('title')
    return post.get("href"), post.get("download")


def Scrape(thread_url, output=None, original_filename=False, combo=False):
    """Download every image/webm attached to a 4chan (or archive) thread.

    Args:
        thread_url: URL of the thread page; any "#anchor" fragment is
            stripped before the page is requested.
        output: Directory to save files into.  Defaults to
            "./[board]-[thread no.]-[subject]" under the current directory.
        original_filename: If True, save each file under the uploader's
            original filename (falls back to the server name when the
            original name is unavailable).
        combo: If True, save as "[server name]_[original name]".

    Raises:
        requests.HTTPError: If the thread page itself cannot be fetched.
    """
    # Strip any #anchor fragment so we request the plain thread page.
    thread_url = re.sub(r"/?#.*$", "", thread_url)

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:  # fixed: identity comparison, not "== None"
        output = os.path.join(os.getcwd(), _dir_name(soup, thread_url))

    print("Saving to: " + output)
    os.makedirs(output, exist_ok=True)
    existingFiles = os.listdir(output)

    # 4chan wraps uploads in "fileText" elements; foolfuuka archives expose
    # a bare <a class="btnr parent" download=...> per file instead.
    imgPosts = soup.find_all(class_="fileText")
    if not imgPosts:
        imgPosts = soup.find_all(
            "a", {"class": "btnr parent", "download": re.compile(r'.*')})

    for post in imgPosts:
        imgUrl, origName = _file_link(post)

        if not imgUrl:
            print("File Deleted")
            continue
        if imgUrl[0] == '/':
            # Protocol-relative link ("//i.4cdn.org/...").
            imgUrl = 'https:' + imgUrl

        unix = os.path.basename(imgUrl)  # server-assigned (timestamp) name
        if unix in existingFiles:
            print(unix + " already present.")
            continue

        print("Downloading URL:", imgUrl)
        # stream=True: write the file to disk in chunks instead of buffering
        # the whole body in memory (fixes eager download of large webms).
        res = s.get(imgUrl, verify=True, stream=True)
        if res.status_code == 404:
            print("404: Not Found")
            res.close()  # release the pooled connection of the unread body
            continue

        if original_filename and origName:
            fname = origName
            if os.path.exists(os.path.join(output, fname)):
                # Same original name already saved; disambiguate by
                # appending the unique server name.
                print("Filename collision")
                fname = os.path.splitext(fname)[0] + "_" + unix
        elif combo and origName:
            fname = os.path.splitext(unix)[0] + "_" + origName
        else:
            # Default naming; also the fallback when the post carries no
            # original filename (previously crashed with TypeError).
            fname = unix

        # save the image
        with open(os.path.join(output, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
description="Downloads all images from the specified 4chan thread.")
|
|
|
|
parser.add_argument(
|
|
|
|
"thread_url",
|
|
|
|
help="The url of the desired thread.")
|
|
|
|
parser.add_argument(
|
|
|
|
"-o",
|
|
|
|
"--output",
|
2018-03-21 23:23:26 -04:00
|
|
|
help="The directory to save images into. Default is " \
|
|
|
|
+ "./[board]-[thread no.]-[subject field].")
|
2018-03-04 21:52:35 -05:00
|
|
|
parser.add_argument(
|
|
|
|
"-f",
|
2018-03-21 23:23:26 -04:00
|
|
|
"--original-filename",
|
2018-03-04 21:52:35 -05:00
|
|
|
action="store_true",
|
|
|
|
help="Saves the files using the original filename.")
|
|
|
|
parser.add_argument(
|
|
|
|
"-c",
|
|
|
|
"--combo",
|
|
|
|
action="store_true",
|
2018-03-21 23:23:26 -04:00
|
|
|
help="Saves the files using a combination of the 4chan filename and " \
|
|
|
|
+ "the original filename.")
|
2018-03-04 21:52:35 -05:00
|
|
|
args = parser.parse_args()
|
|
|
|
|
2018-03-21 23:23:26 -04:00
|
|
|
Scrape(args.thread_url, args.output, args.original_filename, args.combo)