rewrote most of it

iou1name 2018-03-21 23:23:26 -04:00
parent 63aca6bb04
commit a98d839bb6


@@ -1,76 +1,22 @@
 #!/usr/bin/env python3
 """
 Scrapes all images/webms from a 4chan thread.
 TODO: Add support for more image boards.
 """
 import os
-from urllib.parse import urlparse
 import re
+from urllib.parse import urlparse
+import requests
 import bs4
-import requests

 HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}
-def defaultOutput(thread_url, soup):
-    """
-    Parses the thread title, url and subject field (if present) to create
-    a suitable directory name to save files to.
-    Format: [board]-[thread no.]-[subject field]
-    """
-    # TODO: remove the need for thread_url to be passed
-    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
-    if soup.find(class_ = 'subject'):
-        output = output + '-' + soup.find(class_ = 'subject').text
-    output = output.replace("/","")
-    output = output.replace("...","")
-    return output
-
-def extract4chanFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. 4chan only.
-    """
-    imgPosts = soup.find_all(class_ = "fileText")
-    urls = []
-    for post in imgPosts:
-        url = post.find('a').get('href')
-        fname = post.find('a').get('title')
-        if not fname:
-            fname = post.find('a').text
-        urls.append((url, fname))
-    return urls
-
-def extractFoolFuukaFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. FoolFuuka only.
-    """
-    imgPosts = soup.find_all("a", {"class": "btnr parent",
-        "download": re.compile(r'.*')})
-    urls = []
-    for post in imgPosts:
-        url = post.get("href")
-        if not urlparse(url).scheme:
-            url = "http:" + url
-        fname = post.get("download")
-        urls.append((url, fname))
-    return urls
-
-def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
+def Scrape(thread_url, output=None, original_filename=False, combo=False):
     """
     Downloads thread page, extracts file urls and saves them to a directory.
     """
-    boards = {"boards.4chan.org": extract4chanFiles,
-              "boards.fireden.net": extractFoolFuukaFiles,
-              "archive.4plebs.org": extractFoolFuukaFiles}
-    domain = urlparse(thread_url).netloc
-    if domain not in boards:
-        print("Unknown URL. Exiting.")
-        return
     thread_url = re.sub(r"/?#.*$", "", thread_url)
     s = requests.Session()
     s.headers.update(HEADERS)
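
Both the old and new code normalize the thread URL by stripping any post-anchor fragment before requesting it. A quick illustration of the re.sub above (URL and post number made up):

    import re
    url = "https://boards.4chan.org/wg/thread/7134679#p7134700"
    print(re.sub(r"/?#.*$", "", url))
    # https://boards.4chan.org/wg/thread/7134679
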
@@ -81,18 +27,37 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
     soup = bs4.BeautifulSoup(res.text, "html.parser")
     if output == None:
-        output = defaultOutput(thread_url, soup)
-    directory = os.path.join(path, output)
-    os.makedirs(directory, exist_ok=True)
-    existingFiles = os.listdir(directory)
-    urls = boards[domain](soup)
-    if urls == None:
-        print("Unknown URL. Exiting.")
-        return
+        output = os.getcwd()
+        dirName = [urlparse(thread_url).path.split("/")[1]]
+        dirName += [os.path.split(thread_url)[1]]
+        subject = soup.find(class_="subject")
+        if subject:
+            subject = subject.text
+        else:
+            subject = soup.find(class_="post_title")
+            if subject:
+                subject = subject.text
+            else:
+                subject = ""
+        dirName += [subject]
+        dirName = "-".join(dirName)
+        output = os.path.join(output, dirName)
     print("Saving to: " + output)
-    for imgUrl, fname in urls:
+    os.makedirs(output, exist_ok=True)
+    existingFiles = os.listdir(output)
+    imgPosts = soup.find_all(class_ = "fileText")
+    if not imgPosts:
+        imgPosts = soup.find_all("a",
+            {"class": "btnr parent", "download": re.compile(r'.*')})
+    for post in imgPosts:
+        try:
+            imgUrl = post.find('a').get('href')
+        except AttributeError:
+            imgUrl = post.get("href")
         if not imgUrl:
             print("File Deleted")
             continue
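
The inline replacement for defaultOutput() builds the directory name as [board]-[thread no.]-[subject]. A minimal sketch of the same steps, with a made-up URL and subject (note that an empty subject now leaves a trailing "-", which the old defaultOutput() avoided):

    import os
    from urllib.parse import urlparse

    thread_url = "https://boards.4chan.org/wg/thread/7134679"
    dirName = [urlparse(thread_url).path.split("/")[1]]   # "wg" (board)
    dirName += [os.path.split(thread_url)[1]]             # "7134679" (thread no.)
    dirName += ["Papes"]                                  # subject text, "" if absent
    print("-".join(dirName))                              # wg-7134679-Papes
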
@@ -110,8 +75,12 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
             continue
         unix = os.path.basename(imgUrl)
-        if original_fname:
-            if os.path.exists(os.path.join(directory, fname)):
+        try:
+            fname = post.find('a').get('title')
+        except AttributeError:
+            fname = post.get("download")
+        if original_filename:
+            if os.path.exists(os.path.join(output, fname)):
                 print("Filename collision")
                 fname = os.path.splitext(fname)[0]
                 fname += "_" + unix
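
On a name collision the renaming scheme is unchanged: keep the original stem and append the 4chan unix-timestamp filename. Roughly, with hypothetical names:

    import os
    fname = "sunset.jpg"                # original upload name
    unix = "1521684206493.jpg"          # 4chan's server-side name
    fname = os.path.splitext(fname)[0] + "_" + unix
    print(fname)                        # sunset_1521684206493.jpg
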
@@ -120,7 +89,7 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
         else:
             fname = unix
         # save the image
-        with open(os.path.join(directory, fname), 'wb') as imgFile:
+        with open(os.path.join(output, fname), 'wb') as imgFile:
             for chunk in res.iter_content(100000):
                 imgFile.write(chunk)
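
The save loop writes the response body to disk in 100,000-byte chunks. Worth noting: res.iter_content() only avoids buffering the whole file in memory if the request was made with stream=True, which happens outside this hunk. A self-contained sketch with a hypothetical file URL:

    import requests
    res = requests.get("https://i.4cdn.org/wg/1521684206493.jpg", stream=True)
    with open("1521684206493.jpg", 'wb') as imgFile:
        for chunk in res.iter_content(100000):  # 100 kB at a time
            imgFile.write(chunk)
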
@@ -136,28 +105,19 @@ if __name__ == "__main__":
     parser.add_argument(
         "-o",
         "--output",
         default=None,
-        help="Title of directory to save images into. Default is \
-            [board]-[thread no.]-[subject field].")
-    parser.add_argument(
-        "-p",
-        "--path",
-        default=os.getcwd(),
-        help="Full path to the base directory to save gallery into. Default \
-            is current directory.")
+        help="The directory to save images into. Default is " \
+            + "./[board]-[thread no.]-[subject field].")
     parser.add_argument(
         "-f",
-        "--orginal-filename",
-        dest="original_fname",
+        "--original-filename",
         action="store_true",
         default=False,
         help="Saves the files using the original filename.")
     parser.add_argument(
         "-c",
         "--combo",
         action="store_true",
-        help="Saves the files using a combination of the 4chan filename and \
-            the original filename.")
+        help="Saves the files using a combination of the 4chan filename and " \
+            + "the original filename.")
     args = parser.parse_args()
-    Scrape(**vars(args))
+    Scrape(args.thread_url, args.output, args.original_filename, args.combo)
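
With the new signature and flags, a run looks something like this (the script name and thread number are assumptions for illustration):

    # equivalent CLI: python3 scrape.py https://boards.4chan.org/wg/thread/7134679 -f
    from scrape import Scrape
    Scrape("https://boards.4chan.org/wg/thread/7134679", original_filename=True)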