rewrote most of it

iou1name 2018-03-21 23:23:26 -04:00
parent 63aca6bb04
commit a98d839bb6


@@ -1,76 +1,22 @@
 #!/usr/bin/env python3
 """
 Scrapes all images/webms from a 4chan thread.
+TODO: Add support for more image boards.
 """
 import os
-from urllib.parse import urlparse
 import re
+from urllib.parse import urlparse
 
-import requests
 import bs4
+import requests
 
 HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}
 
-def defaultOutput(thread_url, soup):
-    """
-    Parses the thread title, url and subject field (if present) to create
-    a suitable directory name to save files to.
-    Format: [board]-[thread no.]-[subject field]
-    """
-    # TODO: remove the need for thread_url to be passed
-    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
-    if soup.find(class_ = 'subject'):
-        output = output + '-' + soup.find(class_ = 'subject').text
-    output = output.replace("/","")
-    output = output.replace("...","")
-    return output
-
-def extract4chanFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. 4chan only.
-    """
-    imgPosts = soup.find_all(class_ = "fileText")
-    urls = []
-    for post in imgPosts:
-        url = post.find('a').get('href')
-        fname = post.find('a').get('title')
-        if not fname:
-            fname = post.find('a').text
-        urls.append((url, fname))
-    return urls
-
-def extractFoolFuukaFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. FoolFuuka only.
-    """
-    imgPosts = soup.find_all("a", {"class": "btnr parent",
-        "download": re.compile(r'.*')})
-    urls = []
-    for post in imgPosts:
-        url = post.get("href")
-        if not urlparse(url).scheme:
-            url = "http:" + url
-        fname = post.get("download")
-        urls.append((url, fname))
-    return urls
-
-def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
+def Scrape(thread_url, output=None, original_filename=False, combo=False):
     """
     Downloads thread page, extracts file urls and saves them to a directory.
     """
-    boards = {"boards.4chan.org": extract4chanFiles,
-        "boards.fireden.net": extractFoolFuukaFiles,
-        "archive.4plebs.org": extractFoolFuukaFiles}
-    domain = urlparse(thread_url).netloc
-    if domain not in boards:
-        print("Unknown URL. Exiting.")
-        return
-
+    thread_url = re.sub(r"/?#.*$", "", thread_url)
     s = requests.Session()
     s.headers.update(HEADERS)
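The rewrite drops the per-board extractor table and instead normalizes the thread URL up front. A minimal sketch of what the new re.sub() call does, using a hypothetical thread URL (the regex itself is taken from the diff above):

    import re

    # A "#p..." post anchor, plus any trailing slash before it, is stripped
    # so the URL always ends at the thread number.
    url = "https://boards.4chan.org/g/thread/51971506#p51971600"
    print(re.sub(r"/?#.*$", "", url))
    # -> https://boards.4chan.org/g/thread/51971506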
@@ -81,18 +27,37 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
     soup = bs4.BeautifulSoup(res.text, "html.parser")
     if output == None:
-        output = defaultOutput(thread_url, soup)
-    directory = os.path.join(path, output)
-    os.makedirs(directory, exist_ok=True)
-    existingFiles = os.listdir(directory)
-
-    urls = boards[domain](soup)
-    if urls == None:
-        print("Unknown URL. Exiting.")
-        return
-
+        output = os.getcwd()
+    dirName = [urlparse(thread_url).path.split("/")[1]]
+    dirName += [os.path.split(thread_url)[1]]
+    subject = soup.find(class_="subject")
+    if subject:
+        subject = subject.text
+    else:
+        subject = soup.find(class_="post_title")
+        if subject:
+            subject = subject.text
+        else:
+            subject = ""
+    dirName += [subject]
+    dirName = "-".join(dirName)
+    output = os.path.join(output, dirName)
     print("Saving to: " + output)
-    for imgUrl, fname in urls:
+    os.makedirs(output, exist_ok=True)
+    existingFiles = os.listdir(output)
+
+    imgPosts = soup.find_all(class_ = "fileText")
+    if not imgPosts:
+        imgPosts = soup.find_all("a",
+            {"class": "btnr parent", "download": re.compile(r'.*')})
+    for post in imgPosts:
+        try:
+            imgUrl = post.find('a').get('href')
+        except AttributeError:
+            imgUrl = post.get("href")
         if not imgUrl:
             print("File Deleted")
             continue
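The save directory is now built inline as [board]-[thread no.]-[subject], falling back from 4chan's "subject" class to FoolFuuka's "post_title" and then to an empty string. A sketch of the dirName logic, with a hypothetical URL and subject:

    import os
    from urllib.parse import urlparse

    thread_url = "https://boards.4chan.org/wg/thread/7582289"  # hypothetical
    dirName = [urlparse(thread_url).path.split("/")[1]]  # "wg" (the board)
    dirName += [os.path.split(thread_url)[1]]            # "7582289" (thread no.)
    dirName += ["papes"]                                 # subject text, "" if absent
    print("-".join(dirName))                             # wg-7582289-papes

Note that the deleted defaultOutput() stripped "/" and "..." from the result; the rewrite does not, so a subject containing a slash would now yield a nested or invalid path.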
@@ -110,8 +75,12 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
             continue
         unix = os.path.basename(imgUrl)
-        if original_fname:
-            if os.path.exists(os.path.join(directory, fname)):
+        try:
+            fname = post.find('a').get('title')
+        except AttributeError:
+            fname = post.get("download")
+        if original_filename:
+            if os.path.exists(os.path.join(output, fname)):
                 print("Filename collision")
                 fname = os.path.splitext(fname)[0]
                 fname += "_" + unix
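With --original-filename set, a name that already exists in the output directory has its extension dropped and the full unix-timestamp name (which carries its own extension) appended. A worked example of the collision branch, filenames hypothetical:

    import os

    fname = "cat.jpg"                   # original filename, already on disk
    unix = "1521685406000.jpg"          # 4chan's timestamp-based name
    fname = os.path.splitext(fname)[0]  # "cat"
    fname += "_" + unix                 # "cat_1521685406000.jpg"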
@@ -120,7 +89,7 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
         else:
             fname = unix
         # save the image
-        with open(os.path.join(directory, fname), 'wb') as imgFile:
+        with open(os.path.join(output, fname), 'wb') as imgFile:
             for chunk in res.iter_content(100000):
                 imgFile.write(chunk)
@@ -136,28 +105,19 @@ if __name__ == "__main__":
     parser.add_argument(
         "-o",
         "--output",
-        default=None,
-        help="Title of directory to save images into. Default is \
-        [board]-[thread no.]-[subject field].")
-    parser.add_argument(
-        "-p",
-        "--path",
-        default=os.getcwd(),
-        help="Full path to the base directory to save gallery into. Default \
-        is current directory.")
+        help="The directory to save images into. Default is " \
+            + "./[board]-[thread no.]-[subject field].")
     parser.add_argument(
         "-f",
-        "--orginal-filename",
-        dest="original_fname",
+        "--original-filename",
         action="store_true",
-        default=False,
         help="Saves the files using the original filename.")
     parser.add_argument(
         "-c",
         "--combo",
         action="store_true",
-        help="Saves the files using a combination of the 4chan filename and \
-        the original filename.")
+        help="Saves the files using a combination of the 4chan filename and " \
+            + "the original filename.")
     args = parser.parse_args()
-    Scrape(**vars(args))
+    Scrape(args.thread_url, args.output, args.original_filename, args.combo)
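The entry point now passes the parsed arguments positionally, matching the new Scrape() signature. A usage sketch; the script and module names are assumptions, since the file name is not shown in this diff:

    # CLI, assuming the script is saved as scrape.py:
    #   python3 scrape.py https://boards.4chan.org/wg/thread/7582289 -f
    # Or from Python:
    import scrape

    scrape.Scrape("https://boards.4chan.org/wg/thread/7582289",
                  original_filename=True)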