From 11e88d0426927c9403dfe204aed8310c66e1dd84 Mon Sep 17 00:00:00 2001 From: iou1name Date: Wed, 21 Mar 2018 23:23:26 -0400 Subject: [PATCH] rewrote most of it --- 4chanScrape.py | 131 +++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 85 deletions(-) diff --git a/4chanScrape.py b/4chanScrape.py index 28c41d8..e8b1e07 100755 --- a/4chanScrape.py +++ b/4chanScrape.py @@ -4,73 +4,20 @@ Scrapes all images/webms from a 4chan thread. TODO: Add support for more image boards. """ import os -from urllib.parse import urlparse import re +from urllib.parse import urlparse -import requests import bs4 +import requests HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'} -def defaultOutput(thread_url, soup): - """ - Parses the thread title, url and subject field (if present) to create - a suitable directory name to save files to. - Format: [board]-[thread no.]-[subject field] - """ - # TODO: remove the need for thread_url to be passed - output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1] - if soup.find(class_ = 'subject'): - output = output + '-' + soup.find(class_ = 'subject').text - output = output.replace("/","") - output = output.replace("...","") - return output - -def extract4chanFiles(soup): - """ - Extracts all file urls from the provided html-soup object and returns - a list on urls. 4chan only. - """ - imgPosts = soup.find_all(class_ = "fileText") - urls = [] - for post in imgPosts: - url = post.find('a').get('href') - fname = post.find('a').get('title') - if not fname: - fname = post.find('a').text - urls.append((url, fname)) - return urls - - -def extractFoolFuukaFiles(soup): - """ - Extracts all file urls from the provided html-soup object and returns - a list on urls. FoolFuuka only. - """ - imgPosts = soup.find_all("a", {"class": "btnr parent", - "download": re.compile(r'.*')}) - urls = [] - for post in imgPosts: - url = post.get("href") - if not urlparse(url).scheme: - url = "http:" + url - fname = post.get("download") - urls.append((url, fname)) - return urls - - -def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False): +def Scrape(thread_url, output=None, original_filename=False, combo=False): """ Downloads thread page, extracts file urls and saves them to a directory. """ - boards = {"boards.4chan.org": extract4chanFiles, - "boards.fireden.net": extractFoolFuukaFiles, - "archive.4plebs.org": extractFoolFuukaFiles} - domain = urlparse(thread_url).netloc - if domain not in boards: - print("Unknown URL. Exiting.") - return + thread_url = re.sub(r"/?#.*$", "", thread_url) s = requests.Session() s.headers.update(HEADERS) @@ -81,18 +28,37 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb soup = bs4.BeautifulSoup(res.text, "html.parser") if output == None: - output = defaultOutput(thread_url, soup) - directory = os.path.join(path, output) - os.makedirs(directory, exist_ok=True) - existingFiles = os.listdir(directory) - - urls = boards[domain](soup) - if urls == None: - print("Unknown URL. Exiting.") - return + output = os.getcwd() + dirName = [urlparse(thread_url).path.split("/")[1]] + dirName += [os.path.split(thread_url)[1]] + subject = soup.find(class_="subject") + if subject: + subject = subject.text + else: + subject = soup.find(class_="post_title") + if subject: + subject = subject.text + else: + subject = "" + dirName += [subject] + dirName = "-".join(dirName) + output = os.path.join(output, dirName) print("Saving to: " + output) - for imgUrl, fname in urls: + os.makedirs(output, exist_ok=True) + existingFiles = os.listdir(output) + + imgPosts = soup.find_all(class_ = "fileText") + if not imgPosts: + imgPosts = soup.find_all("a", + {"class": "btnr parent", "download": re.compile(r'.*')}) + + for post in imgPosts: + try: + imgUrl = post.find('a').get('href') + except AttributeError: + imgUrl = post.get("href") + if not imgUrl: print("File Deleted") continue @@ -110,8 +76,12 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb continue unix = os.path.basename(imgUrl) - if original_fname: - if os.path.exists(os.path.join(directory, fname)): + try: + fname = post.find('a').get('title') + except AttributeError: + fname = post.get("download") + if original_filename: + if os.path.exists(os.path.join(output, fname)): print("Filename collision") fname = os.path.splitext(fname)[0] fname += "_" + unix @@ -120,7 +90,7 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb else: fname = unix # save the image - with open(os.path.join(directory, fname), 'wb') as imgFile: + with open(os.path.join(output, fname), 'wb') as imgFile: for chunk in res.iter_content(100000): imgFile.write(chunk) @@ -136,28 +106,19 @@ if __name__ == "__main__": parser.add_argument( "-o", "--output", - default=None, - help="Title of directory to save images into. Default is \ - [board]-[thread no.]-[subject field].") - parser.add_argument( - "-p", - "--path", - default=os.getcwd(), - help="Full path to the base directory to save gallery into. Default \ - is current directory.") + help="The directory to save images into. Default is " \ + + "./[board]-[thread no.]-[subject field].") parser.add_argument( "-f", - "--orginal-filename", - dest="original_fname", + "--original-filename", action="store_true", - default=False, help="Saves the files using the original filename.") parser.add_argument( "-c", "--combo", action="store_true", - help="Saves the files using a combination of the 4chan filename and \ - the original filename.") + help="Saves the files using a combination of the 4chan filename and " \ + + "the original filename.") args = parser.parse_args() - Scrape(**vars(args)) + Scrape(args.thread_url, args.output, args.original_filename, args.combo)