rewrote most of it

parent 63aca6bb04
commit a98d839bb6

4chanScrape.py | 132 changed lines

@@ -1,76 +1,22 @@
 #!/usr/bin/env python3
 """
 Scrapes all images/webms from a 4chan thread.
-TODO: Add support for more image boards.
 """
 import os
-from urllib.parse import urlparse
 import re
+from urllib.parse import urlparse
 
-import requests
 import bs4
+import requests
 
 HEADERS = {'User-Agent': 'bix nood gibs me dem maymays'}
 
-def defaultOutput(thread_url, soup):
-    """
-    Parses the thread title, url and subject field (if present) to create
-    a suitable directory name to save files to.
-    Format: [board]-[thread no.]-[subject field]
-    """
-    # TODO: remove the need for thread_url to be passed
-    output = thread_url.split('/')[3] + '-' + os.path.split(thread_url)[1]
-    if soup.find(class_ = 'subject'):
-        output = output + '-' + soup.find(class_ = 'subject').text
-    output = output.replace("/","")
-    output = output.replace("...","")
-    return output
-
-
-def extract4chanFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. 4chan only.
-    """
-    imgPosts = soup.find_all(class_ = "fileText")
-    urls = []
-    for post in imgPosts:
-        url = post.find('a').get('href')
-        fname = post.find('a').get('title')
-        if not fname:
-            fname = post.find('a').text
-        urls.append((url, fname))
-    return urls
-
-
-def extractFoolFuukaFiles(soup):
-    """
-    Extracts all file urls from the provided html-soup object and returns
-    a list on urls. FoolFuuka only.
-    """
-    imgPosts = soup.find_all("a", {"class": "btnr parent",
-                                   "download": re.compile(r'.*')})
-    urls = []
-    for post in imgPosts:
-        url = post.get("href")
-        if not urlparse(url).scheme:
-            url = "http:" + url
-        fname = post.get("download")
-        urls.append((url, fname))
-    return urls
-
-
-def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, combo=False):
+
+def Scrape(thread_url, output=None, original_filename=False, combo=False):
     """
     Downloads thread page, extracts file urls and saves them to a directory.
     """
-    boards = {"boards.4chan.org": extract4chanFiles,
-              "boards.fireden.net": extractFoolFuukaFiles,
-              "archive.4plebs.org": extractFoolFuukaFiles}
-    domain = urlparse(thread_url).netloc
-    if domain not in boards:
-        print("Unknown URL. Exiting.")
-        return
+    thread_url = re.sub(r"/?#.*$", "", thread_url)
 
     s = requests.Session()
     s.headers.update(HEADERS)
@@ -81,18 +27,37 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
     soup = bs4.BeautifulSoup(res.text, "html.parser")
 
     if output == None:
-        output = defaultOutput(thread_url, soup)
-    directory = os.path.join(path, output)
-    os.makedirs(directory, exist_ok=True)
-    existingFiles = os.listdir(directory)
-
-    urls = boards[domain](soup)
-    if urls == None:
-        print("Unknown URL. Exiting.")
-        return
+        output = os.getcwd()
+    dirName = [urlparse(thread_url).path.split("/")[1]]
+    dirName += [os.path.split(thread_url)[1]]
+    subject = soup.find(class_="subject")
+    if subject:
+        subject = subject.text
+    else:
+        subject = soup.find(class_="post_title")
+        if subject:
+            subject = subject.text
+        else:
+            subject = ""
+    dirName += [subject]
+    dirName = "-".join(dirName)
+    output = os.path.join(output, dirName)
 
     print("Saving to: " + output)
-    for imgUrl, fname in urls:
+    os.makedirs(output, exist_ok=True)
+    existingFiles = os.listdir(output)
+
+    imgPosts = soup.find_all(class_ = "fileText")
+    if not imgPosts:
+        imgPosts = soup.find_all("a",
+                {"class": "btnr parent", "download": re.compile(r'.*')})
+
+    for post in imgPosts:
+        try:
+            imgUrl = post.find('a').get('href')
+        except AttributeError:
+            imgUrl = post.get("href")
+
         if not imgUrl:
             print("File Deleted")
             continue
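
Note: the new directory name is assembled from the board, the thread number, and the
subject (falling back to FoolFuuka's "post_title", then to an empty string). A rough
illustration of what the block above produces, using a hypothetical URL and subject:

    # Illustration only -- hypothetical values, not part of the commit.
    # thread_url = "https://boards.4chan.org/wg/thread/1234567"
    # urlparse(thread_url).path.split("/")[1]  ->  "wg"
    # os.path.split(thread_url)[1]             ->  "1234567"
    # subject text found on the page           ->  "Nature Wallpapers"
    # dirName = "-".join(["wg", "1234567", "Nature Wallpapers"])
    #         = "wg-1234567-Nature Wallpapers"
    # output  = os.path.join(os.getcwd(), "wg-1234567-Nature Wallpapers")
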
@@ -110,8 +75,12 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
             continue
 
         unix = os.path.basename(imgUrl)
-        if original_fname:
-            if os.path.exists(os.path.join(directory, fname)):
+        try:
+            fname = post.find('a').get('title')
+        except AttributeError:
+            fname = post.get("download")
+        if original_filename:
+            if os.path.exists(os.path.join(output, fname)):
                 print("Filename collision")
                 fname = os.path.splitext(fname)[0]
                 fname += "_" + unix
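
Note: with the original-filename option, a collision in the output directory keeps the
original basename and appends the server-side name (the `unix` basename of the file URL,
which appears to be a timestamp-style name on 4chan). A small hypothetical example:

    # fname = "sunset.jpg", os.path.basename(imgUrl) = "1489989400522.jpg"  (hypothetical)
    # after the collision branch above:
    #     fname -> "sunset" + "_" + "1489989400522.jpg" == "sunset_1489989400522.jpg"
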
@@ -120,7 +89,7 @@ def Scrape(thread_url, output=None, path=os.getcwd(), original_fname=False, comb
         else:
             fname = unix
         # save the image
-        with open(os.path.join(directory, fname), 'wb') as imgFile:
+        with open(os.path.join(output, fname), 'wb') as imgFile:
             for chunk in res.iter_content(100000):
                 imgFile.write(chunk)
 
@@ -136,28 +105,19 @@ if __name__ == "__main__":
     parser.add_argument(
         "-o",
         "--output",
-        default=None,
-        help="Title of directory to save images into. Default is \
-             [board]-[thread no.]-[subject field].")
-    parser.add_argument(
-        "-p",
-        "--path",
-        default=os.getcwd(),
-        help="Full path to the base directory to save gallery into. Default \
-             is current directory.")
+        help="The directory to save images into. Default is " \
+             + "./[board]-[thread no.]-[subject field].")
     parser.add_argument(
         "-f",
-        "--orginal-filename",
-        dest="original_fname",
+        "--original-filename",
         action="store_true",
-        default=False,
         help="Saves the files using the original filename.")
     parser.add_argument(
         "-c",
        "--combo",
         action="store_true",
-        help="Saves the files using a combination of the 4chan filename and \
-             the original filename.")
+        help="Saves the files using a combination of the 4chan filename and " \
+             + "the original filename.")
     args = parser.parse_args()
 
-    Scrape(**vars(args))
+    Scrape(args.thread_url, args.output, args.original_filename, args.combo)
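
Note: a sketch of how the rewritten script would be invoked. The flag names are taken
from the argparse calls above; the positional thread_url argument is defined outside the
hunks shown here, so its exact form is assumed:

    # python3 4chanScrape.py https://boards.4chan.org/wg/thread/1234567
    # python3 4chanScrape.py -f -o /tmp/threads https://boards.4chan.org/wg/thread/1234567
    #
    # Equivalent direct call, matching the new signature:
    # Scrape("https://boards.4chan.org/wg/thread/1234567",
    #        output="/tmp/threads", original_filename=True, combo=False)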