rewrote most of it

2018-03-21 23:23:26 -04:00
1 changed file with 46 additions and 85 deletions
--- a/4chanScrape.py
+++ b/4chanScrape.py
@@ -4,73 +4,20 @@ Scrapes all images/webms from a 4chan thread.
 TODO: Add support for more image boards.
 """
 import os
-from urllib.parse import urlparse
 import re
+from urllib.parse import urlparse

-import requests
 import bs4
+import requests

 HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}

-def defaultOutput(thread_url, soup):
-	"""
-	Parses the thread title, url and subject field (if present) to create
-	a suitable directory name to save files to.
-	Format: [board]-[thread no.]-[subject field]
-	"""
-	# TODO: remove the need for thread_url to be passed
-	output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
-	if soup.find(class_ = 'subject'):
-		output = output + '-' + soup.find(class_ = 'subject').text
-	output = output.replace("/","")
-	output = output.replace("...","")
-	return output

-
-def extract4chanFiles(soup):
-	"""
-	Extracts all file urls from the provided html-soup object and returns
-	a list on urls. 4chan only.
-	"""
-	imgPosts = soup.find_all(class_ = "fileText")
-	urls = []
-	for post in imgPosts:
-		url = post.find('a').get('href')
-		fname = post.find('a').get('title')
-		if not fname:
-			fname = post.find('a').text
-		urls.append((url, fname))
-	return urls
-
-
-def extractFoolFuukaFiles(soup):
-	"""
-	Extracts all file urls from the provided html-soup object and returns
-	a list on urls. FoolFuuka only.
-	"""
-	imgPosts = soup.find_all("a", {"class": "btnr parent",
-		"download": re.compile(r'.*')})
-	urls = []
-	for post in imgPosts:
-		url = post.get("href")
-		if not urlparse(url).scheme:
-			url = "http:" + url
-		fname = post.get("download")
-		urls.append((url, fname))
-	return urls
-
-
-def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
+def Scrape(thread_url, output=None, original_filename=False, combo=False):
 	"""
 	Downloads thread page, extracts file urls and saves them to a directory.
 	"""
-	boards = {"boards.4chan.org": extract4chanFiles,
-		"boards.fireden.net": extractFoolFuukaFiles,
-		"archive.4plebs.org": extractFoolFuukaFiles}
-	domain = urlparse(thread_url).netloc
-	if domain not in boards:
-		print("Unknown URL. Exiting.")
-		return
+	thread_url = re.sub(r"/?#.*$", "", thread_url)

 	s = requests.Session()
 	s.headers.update(HEADERS)
@@ -81,18 +28,37 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
 	soup = bs4.BeautifulSoup(res.text, "html.parser")

 	if output == None:
-		output = defaultOutput(thread_url, soup)
-	directory = os.path.join(path, output)
-	os.makedirs(directory, exist_ok=True)
-	existingFiles = os.listdir(directory)
-
-	urls = boards[domain](soup)
-	if urls == None:
-		print("Unknown URL. Exiting.")
-		return
+		output = os.getcwd()
+		dirName = [urlparse(thread_url).path.split("/")[1]]
+		dirName += [os.path.split(thread_url)[1]]
+		subject = soup.find(class_="subject")
+		if subject:
+			subject = subject.text
+		else:
+			subject = soup.find(class_="post_title")
+			if subject:
+				subject = subject.text
+			else:
+				subject = ""
+		dirName += [subject]
+		dirName = "-".join(dirName)
+		output = os.path.join(output, dirName)

 	print("Saving to: " + output)
-	for imgUrl, fname in urls:
+	os.makedirs(output, exist_ok=True)
+	existingFiles = os.listdir(output)
+
+	imgPosts = soup.find_all(class_ = "fileText")
+	if not imgPosts:
+		imgPosts = soup.find_all("a",
+			{"class": "btnr parent", "download": re.compile(r'.*')})
+
+	for post in imgPosts:
+		try:
+			imgUrl = post.find('a').get('href')
+		except AttributeError:
+			imgUrl = post.get("href")
+
 		if not imgUrl:
 			print("File Deleted")
 			continue
@@ -110,8 +76,12 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
 			continue

 		unix = os.path.basename(imgUrl)
-		if original_fname:
-			if os.path.exists(os.path.join(directory, fname)):
+		try:
+			fname = post.find('a').get('title')
+		except AttributeError:
+			fname = post.get("download")
+		if original_filename:
+			if os.path.exists(os.path.join(output, fname)):
 				print("Filename collision")
 				fname = os.path.splitext(fname)[0]
 				fname += "_" + unix
@@ -120,7 +90,7 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
 		else:
 			fname = unix
 		# save the image
-		with open(os.path.join(directory, fname), 'wb') as imgFile:
+		with open(os.path.join(output, fname), 'wb') as imgFile:
 			for chunk in res.iter_content(100000):
 				imgFile.write(chunk)

@@ -136,28 +106,19 @@ if __name__ == "__main__":
 	parser.add_argument(
 		"-o",
 		"--output",
-		default=None,
-		help="Title of directory to save images into. Default is \
-		[board]-[thread no.]-[subject field].")
-	parser.add_argument(
-		"-p",
-		"--path",
-		default=os.getcwd(),
-		help="Full path to the base directory to save gallery into. Default \
-		is current directory.")
+		help="The directory to save images into. Default is " \
+		+ "./[board]-[thread no.]-[subject field].")
 	parser.add_argument(
 		"-f",
-		"--orginal-filename",
-		dest="original_fname",
+		"--original-filename",
 		action="store_true",
-		default=False,
 		help="Saves the files using the original filename.")
 	parser.add_argument(
 		"-c",
 		"--combo",
 		action="store_true",
-		help="Saves the files using a combination of the 4chan filename and \
-		the original filename.")
+		help="Saves the files using a combination of the 4chan filename and " \
+		+ "the original filename.")
 	args = parser.parse_args()

-	Scrape(**vars(args))
+	Scrape(args.thread_url, args.output, args.original_filename, args.combo)