first commit
commit 63aca6bb04

4chanScrape.py (executable file, 163 lines added)
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Scrapes all images/webms from a 4chan thread.

TODO: Add support for more image boards.
"""
import os
import re
from urllib.parse import urlparse

import requests
import bs4


HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def defaultOutput(thread_url, soup):
    """
    Parses the thread url and the subject field (if present) to create
    a suitable directory name to save files to.

    Format: [board]-[thread no.]-[subject field]
    """
    # TODO: remove the need for thread_url to be passed
    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
    subject = soup.find(class_='subject')
    if subject:
        output = output + '-' + subject.text
    # strip characters that are awkward in directory names
    output = output.replace("/", "")
    output = output.replace("...", "")
    return output


def extract4chanFiles(soup):
    """
    Extracts all file urls from the provided html-soup object and returns
    a list of (url, filename) tuples. 4chan only.
    """
    imgPosts = soup.find_all(class_="fileText")
    urls = []
    for post in imgPosts:
        link = post.find('a')
        url = link.get('href')
        # long filenames are truncated in the link text; the full name
        # is kept in the title attribute when present
        fname = link.get('title')
        if not fname:
            fname = link.text
        urls.append((url, fname))
    return urls


def extractFoolFuukaFiles(soup):
    """
    Extracts all file urls from the provided html-soup object and returns
    a list of (url, filename) tuples. FoolFuuka only.
    """
    imgPosts = soup.find_all("a", {"class": "btnr parent",
                                   "download": re.compile(r'.*')})
    urls = []
    for post in imgPosts:
        url = post.get("href")
        # some archives serve scheme-less links; add a scheme
        if not urlparse(url).scheme:
            url = "http:" + url
        fname = post.get("download")
        urls.append((url, fname))
    return urls


def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
    """
    Downloads thread page, extracts file urls and saves them to a directory.
    """
    # map each supported domain to its extractor function
    boards = {"boards.4chan.org": extract4chanFiles,
              "boards.fireden.net": extractFoolFuukaFiles,
              "archive.4plebs.org": extractFoolFuukaFiles}
    domain = urlparse(thread_url).netloc
    if domain not in boards:
        print("Unknown URL. Exiting.")
        return

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:
        output = defaultOutput(thread_url, soup)
    directory = os.path.join(path, output)
    os.makedirs(directory, exist_ok=True)
    existingFiles = os.listdir(directory)

    urls = boards[domain](soup)
    if not urls:
        print("No files found. Exiting.")
        return

    print("Saving to: " + output)
    for imgUrl, fname in urls:
        if not imgUrl:
            print("File Deleted")
            continue
        # 4chan file links are protocol-relative (start with //)
        if imgUrl.startswith('/'):
            imgUrl = 'https:' + imgUrl

        # skip files that were already downloaded on a previous run
        if os.path.basename(imgUrl) in existingFiles:
            print(os.path.basename(imgUrl) + " already present.")
            continue
        print("Downloading URL:", imgUrl)
        res = s.get(imgUrl, verify=True)

        if res.status_code == 404:
            print("404: Not Found")
            continue

        # 4chan names uploads with a unix timestamp
        unix = os.path.basename(imgUrl)
        if original_fname:
            if os.path.exists(os.path.join(directory, fname)):
                # append the timestamp name to avoid overwriting
                print("Filename collision")
                fname = os.path.splitext(fname)[0]
                fname += "_" + unix
        elif combo:
            fname = os.path.splitext(unix)[0] + "_" + fname
        else:
            fname = unix
        # save the image to disk in chunks
        with open(os.path.join(directory, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Downloads all images from the specified 4chan thread.")
    parser.add_argument(
        "thread_url",
        help="The url of the desired thread.")
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        help="Title of directory to save images into. Default is "
             "[board]-[thread no.]-[subject field].")
    parser.add_argument(
        "-p",
        "--path",
        default=os.getcwd(),
        help="Full path to the base directory to save the gallery into. "
             "Default is the current directory.")
    parser.add_argument(
        "-f",
        "--original-filename",
        dest="original_fname",
        action="store_true",
        default=False,
        help="Saves the files using the original filename.")
    parser.add_argument(
        "-c",
        "--combo",
        action="store_true",
        help="Saves the files using a combination of the 4chan filename and "
             "the original filename.")
    args = parser.parse_args()

    Scrape(**vars(args))
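For reference, a minimal usage sketch based on the argparse options defined above; the thread URL, output directory and path here are purely illustrative:

    python3 4chanScrape.py https://boards.4chan.org/wg/thread/1234567
    python3 4chanScrape.py https://boards.4chan.org/wg/thread/1234567 -o wallpapers -p ~/Pictures -f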