first commit

2018-03-04 21:52:35 -05:00 · 2018-03-04 21:52:35 -05:00 · 63aca6bb04
commit 63aca6bb04
2 changed files with 164 additions and 0 deletions
--- a/4chanScrape.py
+++ b/4chanScrape.py
@ -0,0 +1,163 @@
 #!/usr/bin/env python3
 """
 Scrapes all images/webms from a 4chan thread.
 TODO: Add support for more image boards.
 """
 import os
 from urllib.parse import urlparse
 import re
 import requests
 import bs4
 # HTTP headers that Scrape() installs on its requests session; only a
 # custom User-Agent is set.
 HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}
 def defaultOutput(thread_url, soup):
 	"""
 	Parses the thread url and subject field (if present) to create
 	a suitable directory name to save files to.
 	Format: [board]-[thread no.]-[subject field]
 	thread_url: full thread url; the board is taken from the first path
 		component and the thread number from the last one.
 	soup: parsed thread page; only soup.find(class_='subject') is used.
 	Returns the name with every "/" and "..." removed. The subject part
 	is omitted when the page has no subject element or it is blank.
 	"""
 	# TODO: remove the need for thread_url to be passed
 	# Drop a "#p123" post anchor, a query string and any trailing slash
 	# first; otherwise the last path component is not the thread number
 	# (archive urls usually end in "/", which used to yield "board-").
 	cleaned = thread_url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
 	board = cleaned.split('/')[3]
 	thread_no = os.path.split(cleaned)[1]
 	output = board + '-' + thread_no
 	# Look the subject up once; a blank subject would leave a dangling "-".
 	subject = soup.find(class_='subject')
 	subject_text = subject.text.strip() if subject is not None else ''
 	if subject_text:
 		output = output + '-' + subject_text
 	return output.replace("/", "").replace("...", "")
 def extract4chanFiles(soup):
 	"""
 	Extracts all file urls from the provided html-soup object and returns
 	a list of (url, filename) tuples. 4chan only.
 	The filename is the link's "title" attribute when it is set, otherwise
 	the link text. A "fileText" element that contains no link yields
 	(None, None) instead of raising, so the caller can report and skip it.
 	"""
 	urls = []
 	for post in soup.find_all(class_="fileText"):
 		# Look the anchor up once instead of once per attribute.
 		link = post.find('a')
 		if link is None:
 			urls.append((None, None))
 			continue
 		urls.append((link.get('href'), link.get('title') or link.text))
 	return urls
 def extractFoolFuukaFiles(soup):
 	"""
 	Collects the file links of a FoolFuuka thread page.
 	Looks up every <a class="btnr parent"> that carries a "download"
 	attribute and returns a list of (url, filename) tuples, where the
 	filename is the value of that attribute. Urls without a scheme are
 	prefixed with "http:".
 	"""
 	def withScheme(href):
 		# Leave hrefs that already name a scheme untouched.
 		return href if urlparse(href).scheme else "http:" + href
 	anchors = soup.find_all("a", {"class": "btnr parent",
 		"download": re.compile(r'.*')})
 	return [(withScheme(a.get("href")), a.get("download")) for a in anchors]
 def Scrape(thread_url, output=None, path=None, original_fname=False, combo=False):
 	"""
 	Downloads thread page, extracts file urls and saves them to a directory.
 	thread_url: url of the thread; its host must be one of the boards
 		listed below, otherwise a message is printed and nothing happens.
 	output: name of the directory to save into. Default is the name built
 		by defaultOutput().
 	path: base directory that will contain the output directory. Default
 		is the working directory at the time of the call.
 	original_fname: save each file under the filename reported by the
 		board; on a name collision the server filename is appended.
 	combo: save each file as [server filename]_[original filename].
 		Ignored when original_fname is set.
 	Returns None. Whatever res.raise_for_status() raises for the thread
 	page propagates to the caller; a file whose download does not succeed
 	is reported and skipped.
 	"""
 	boards = {"boards.4chan.org": extract4chanFiles,
 		"boards.fireden.net": extractFoolFuukaFiles,
 		"archive.4plebs.org": extractFoolFuukaFiles}
 	domain = urlparse(thread_url).netloc
 	if domain not in boards:
 		print("Unknown URL. Exiting.")
 		return
 	if path is None:
 		# Resolved here rather than in the signature so the default is the
 		# working directory at call time, not at import time.
 		path = os.getcwd()
 	# The context manager closes the session's connections on every exit path.
 	with requests.Session() as s:
 		s.headers.update(HEADERS)
 		print('Downloading page: ' + thread_url)
 		res = s.get(thread_url, verify=True)
 		res.raise_for_status()
 		soup = bs4.BeautifulSoup(res.text, "html.parser")
 		if output is None:
 			output = defaultOutput(thread_url, soup)
 		directory = os.path.join(path, output)
 		os.makedirs(directory, exist_ok=True)
 		existingFiles = set(os.listdir(directory))
 		urls = boards[domain](soup)
 		if urls is None:
 			print("Unknown URL. Exiting.")
 			return
 		print("Saving to: " + output)
 		for imgUrl, fname in urls:
 			if not imgUrl:
 				print("File Deleted")
 				continue
 			if imgUrl.startswith('/'):
 				imgUrl = 'https:' + imgUrl
 			unix = os.path.basename(imgUrl)
 			# The original filename comes from the remote page: keep only its
 			# last path component so it cannot escape the output directory.
 			safeName = os.path.basename(fname) if fname else ""
 			if safeName in (".", ".."):
 				safeName = ""
 			# Without a usable original filename both naming modes fall back
 			# to the server filename.
 			if safeName and combo and not original_fname:
 				saveAs = os.path.splitext(unix)[0] + "_" + safeName
 			else:
 				saveAs = unix
 			# Check the name the file is actually saved under as well, so
 			# that combo mode does not download everything again on a rerun.
 			present = [name for name in (unix, saveAs) if name in existingFiles]
 			if present:
 				print(present[0] + " already present.")
 				continue
 			print("Downloading URL:", imgUrl)
 			res = s.get(imgUrl, verify=True)
 			if res.status_code == 404:
 				print("404: Not Found")
 				continue
 			if not res.ok:
 				# Any other error response would otherwise be written to disk
 				# as if it were the image.
 				print(str(res.status_code) + ": Skipping")
 				continue
 			if original_fname and safeName:
 				saveAs = safeName
 				if os.path.exists(os.path.join(directory, saveAs)):
 					print("Filename collision")
 					saveAs = os.path.splitext(safeName)[0] + "_" + unix
 			# save the image
 			with open(os.path.join(directory, saveAs), 'wb') as imgFile:
 				for chunk in res.iter_content(100000):
 					imgFile.write(chunk)
 # Command-line entry point: parses the arguments below and hands them to
 # Scrape() as keyword arguments, so every dest must match a Scrape parameter.
 if __name__ == "__main__":
 	import argparse
 	parser = argparse.ArgumentParser(
 		description="Downloads all images from the specified 4chan thread.")
 	parser.add_argument(
 		"thread_url",
 		help="The url of the desired thread.")
 	parser.add_argument(
 		"-o",
 		"--output",
 		default=None,
 		help="Title of directory to save images into. Default is \
 		[board]-[thread no.]-[subject field].")
 	parser.add_argument(
 		"-p",
 		"--path",
 		default=os.getcwd(),
 		help="Full path to the base directory to save gallery into. Default \
 		is current directory.")
 	# "--orginal-filename" was a typo in the first release; it is kept as an
 	# alias so existing invocations keep working.
 	parser.add_argument(
 		"-f",
 		"--original-filename",
 		"--orginal-filename",
 		dest="original_fname",
 		action="store_true",
 		default=False,
 		help="Saves the files using the original filename.")
 	parser.add_argument(
 		"-c",
 		"--combo",
 		action="store_true",
 		help="Saves the files using a combination of the 4chan filename and \
 		the original filename.")
 	args = parser.parse_args()
 	Scrape(**vars(args))
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
 Scrape 4chan.