first commit
commit 63aca6bb04

4chanScrape.py (executable file, 163 lines added)
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Scrapes all images/webms from a 4chan thread.

TODO: Add support for more image boards.
"""
import os
import re
from urllib.parse import urlparse

import requests
import bs4


HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}


def defaultOutput(thread_url, soup):
    """
    Parses the thread url and the subject field (if present) to create
    a suitable directory name to save files to.

    Format: [board]-[thread no.]-[subject field]
    """
    # TODO: remove the need for thread_url to be passed
    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
    subject = soup.find(class_='subject')
    if subject:
        output = output + '-' + subject.text
    # strip characters that are awkward in directory names
    output = output.replace("/", "")
    output = output.replace("...", "")
    return output


def extract4chanFiles(soup):
    """
    Extracts all file urls from the provided html-soup object and returns
    a list of (url, filename) tuples. 4chan only.
    """
    imgPosts = soup.find_all(class_="fileText")
    urls = []
    for post in imgPosts:
        link = post.find('a')
        url = link.get('href')
        # long filenames are truncated in the link text; the full name
        # is kept in the title attribute when present
        fname = link.get('title')
        if not fname:
            fname = link.text
        urls.append((url, fname))
    return urls


def extractFoolFuukaFiles(soup):
    """
    Extracts all file urls from the provided html-soup object and returns
    a list of (url, filename) tuples. FoolFuuka only.
    """
    imgPosts = soup.find_all("a", {"class": "btnr parent",
                                   "download": re.compile(r'.*')})
    urls = []
    for post in imgPosts:
        url = post.get("href")
        # some archives serve scheme-less links; add a scheme
        if not urlparse(url).scheme:
            url = "http:" + url
        fname = post.get("download")
        urls.append((url, fname))
    return urls


def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
    """
    Downloads thread page, extracts file urls and saves them to a directory.
    """
    # map each supported domain to its extractor function
    boards = {"boards.4chan.org": extract4chanFiles,
              "boards.fireden.net": extractFoolFuukaFiles,
              "archive.4plebs.org": extractFoolFuukaFiles}
    domain = urlparse(thread_url).netloc
    if domain not in boards:
        print("Unknown URL. Exiting.")
        return

    s = requests.Session()
    s.headers.update(HEADERS)

    print('Downloading page: ' + thread_url)
    res = s.get(thread_url, verify=True)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    if output is None:
        output = defaultOutput(thread_url, soup)
    directory = os.path.join(path, output)
    os.makedirs(directory, exist_ok=True)
    existingFiles = os.listdir(directory)

    urls = boards[domain](soup)
    if not urls:
        print("No files found. Exiting.")
        return

    print("Saving to: " + output)
    for imgUrl, fname in urls:
        if not imgUrl:
            print("File Deleted")
            continue
        # 4chan file links are protocol-relative (start with //)
        if imgUrl.startswith('/'):
            imgUrl = 'https:' + imgUrl

        # skip files that were already downloaded on a previous run
        if os.path.basename(imgUrl) in existingFiles:
            print(os.path.basename(imgUrl) + " already present.")
            continue
        print("Downloading URL:", imgUrl)
        res = s.get(imgUrl, verify=True)

        if res.status_code == 404:
            print("404: Not Found")
            continue

        # 4chan names uploads with a unix timestamp
        unix = os.path.basename(imgUrl)
        if original_fname:
            if os.path.exists(os.path.join(directory, fname)):
                # append the timestamp name to avoid overwriting
                print("Filename collision")
                fname = os.path.splitext(fname)[0]
                fname += "_" + unix
        elif combo:
            fname = os.path.splitext(unix)[0] + "_" + fname
        else:
            fname = unix
        # save the image to disk in chunks
        with open(os.path.join(directory, fname), 'wb') as imgFile:
            for chunk in res.iter_content(100000):
                imgFile.write(chunk)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Downloads all images from the specified 4chan thread.")
    parser.add_argument(
        "thread_url",
        help="The url of the desired thread.")
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        help="Title of directory to save images into. Default is "
             "[board]-[thread no.]-[subject field].")
    parser.add_argument(
        "-p",
        "--path",
        default=os.getcwd(),
        help="Full path to the base directory to save the gallery into. "
             "Default is the current directory.")
    parser.add_argument(
        "-f",
        "--original-filename",
        dest="original_fname",
        action="store_true",
        default=False,
        help="Saves the files using the original filename.")
    parser.add_argument(
        "-c",
        "--combo",
        action="store_true",
        help="Saves the files using a combination of the 4chan filename and "
             "the original filename.")
    args = parser.parse_args()

    Scrape(**vars(args))
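For reference, a minimal usage sketch based on the argparse options defined above; the thread URL, output directory and path here are purely illustrative:

    python3 4chanScrape.py https://boards.4chan.org/wg/thread/1234567
    python3 4chanScrape.py https://boards.4chan.org/wg/thread/1234567 -o wallpapers -p ~/Pictures -f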