Skip to content

Commit

Permalink
Merge pull request #15 from balbi-uff/feature/workable_by_terminal
Browse files Browse the repository at this point in the history
Feature/workable by terminal
  • Loading branch information
balbi-uff authored Jan 3, 2023
2 parents 858bd4d + 1dcbf63 commit ab2b4a5
Show file tree
Hide file tree
Showing 8 changed files with 1,501 additions and 31 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
**-workspace
*.pytest**
**/test_cases/*
**/.idea/*
6 changes: 0 additions & 6 deletions 4chanScrapper.py

This file was deleted.

136 changes: 136 additions & 0 deletions FchanScrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import sys

from scrapper.scrapper_methods import download_files_from_thread

MINIMUM_ARGUMENTS_THRESHOLD = 1


def get_arguments_from_command_line():
    """
    Fetches the argument list the script was started with.

    Returns: ``sys.argv`` itself (the same list object, not a copy).
    """
    raw_arguments = sys.argv
    return raw_arguments


def is_manual_mode_trigger(argument):
    """
    Tells whether an argument switches the script into manual mode.

    Args:
        argument: A single command-line argument string.
    Returns: True when the argument begins with "-m", False otherwise.
    """
    manual_mode_prefix = "-m"
    return argument.startswith(manual_mode_prefix)


def has_minimum_argument_thresold(arguments):
    """
    Checks that enough arguments were supplied.

    Args:
        arguments: Sized collection of command-line arguments.
    Returns: True when the number of arguments reaches MINIMUM_ARGUMENTS_THRESHOLD.
    """
    argument_count = len(arguments)
    return MINIMUM_ARGUMENTS_THRESHOLD <= argument_count


def manual_resolution_setting_simple(arguments):
    """
    Detects the simple manual-mode resolution setting.

    Args:
        arguments: Collection of manual-mode arguments.
    Returns: True when the "--resolution" flag is one of the arguments.
    """
    simple_resolution_flag = "--resolution"
    return simple_resolution_flag in arguments


def manual_resolution_setting_full_config(arguments):
    """
    Detects the full-config manual-mode resolution setting.

    The simple "--resolution" flag takes precedence: when it is present this
    check is False regardless of the other flags.

    Args:
        arguments: Collection of manual-mode arguments.
    Returns: True when "--max-res" or "--min-res" is present and "--resolution" is not.
    """
    if manual_resolution_setting_simple(arguments):
        return False
    return "--max-res" in arguments or "--min-res" in arguments


def get_resolution_from_arguments_standart(res_arguments, resolution_trigger_str):
    """
    Generic reader for a resolution that follows a given flag.

    The two arguments right after the flag are taken as the x and y values.

    Args:
        res_arguments: List of arguments containing the flag.
        resolution_trigger_str: Flag that precedes the two resolution values.
    Returns: Tuple (x_resolution, y_resolution) of integers.
    Raises:
        ValueError: The flag is not in the list, or a value is not an integer.
        IndexError: Fewer than two values follow the flag.
    """
    values_start = res_arguments.index(resolution_trigger_str) + 1
    return int(res_arguments[values_start]), int(res_arguments[values_start + 1])


def get_resolution_from_arguments_simple(res_arguments):
    """
    Reads the resolution given through the simple "--resolution" flag.

    Args:
        res_arguments: List of manual-mode arguments containing "--resolution".
    Returns: Whatever get_resolution_from_arguments_standart returns for that flag.
    """
    return get_resolution_from_arguments_standart(
        res_arguments,
        resolution_trigger_str="--resolution",
    )


def define_resolution_full_config(res_args, resolution_trigger_str):
    """
    Reads the resolution bound to one flag, if that flag was given.

    Args:
        res_args: Collection of manual-mode arguments.
        resolution_trigger_str: Flag to look for (the callers pass "--min-res" or "--max-res").
    Returns: Tuple (x, y) read after the flag, or (None, None) when the flag is absent.
    """
    if resolution_trigger_str not in res_args:
        return None, None

    width, height = get_resolution_from_arguments_standart(res_args, resolution_trigger_str)
    return width, height


def get_resolution_from_arguments_full_config(res_arguments):
    """
    Reads both the minimum and the maximum resolution from the arguments.

    Args:
        res_arguments: Collection of manual-mode arguments.
    Returns: Tuple (min_x_res, min_y_res, max_x_res, max_y_res); the pair of a
        flag that was not given is (None, None).
    """
    limits = []
    # Minimum first, maximum second: callers unpack the result in that order.
    for trigger in ("--min-res", "--max-res"):
        x_limit, y_limit = define_resolution_full_config(res_arguments, trigger)
        limits += [x_limit, y_limit]
    return tuple(limits)


def manual_mode_download(arguments):
    """
    Runs a download in manual mode, which is triggered by passing -m.

    Expected command lines (the number under each token is its index in ``arguments``):

        $ python FchanScrapper.py -m  thread_link download_path --resolution 1920 1080
                 [0]              [1] [2]         [3]           [4]          [5]  [6]

        $ python FchanScrapper.py -m  thread_link download_path --min-res 1920 1080 --max-res 1920 1080
                 [0]              [1] [2]         [3]           [4]       [5]  [6]  [7]       [8]  [9]

    Args:
        arguments: Full command-line argument list, script name and -m included.
    Returns: The result of download_files_from_thread.
    Raises:
        Exception: No resolution flag was given.
    """
    # Drop the script name and the -m trigger.
    mode_args = arguments[2:]

    # NOTE: the filter keywords below are always passed in the order
    # min_x, min_y, max_x, max_y; get_div_link in scrapper/scrapper_methods.py
    # unpacks them positionally through filters.values().
    if manual_resolution_setting_simple(mode_args):
        width, height = get_resolution_from_arguments_simple(mode_args)
        print(f"Resolution limits set to: {width}x{height}")
        return download_files_from_thread(
            mode_args[0], mode_args[1],
            min_x_res=width, min_y_res=height,
            max_x_res=None, max_y_res=None,
        )

    if not manual_resolution_setting_full_config(mode_args):
        raise Exception("Resolution not set. Please set resolution with --resolution <resolution-x> <resolution-y> or "
                        "type -h for help.")

    lower_x, lower_y, upper_x, upper_y = get_resolution_from_arguments_full_config(mode_args)
    print(f"minimum accepted resolution: {lower_x}x{lower_y}")
    print(f"maximum accepted resolution: {upper_x}x{upper_y}")
    return download_files_from_thread(
        mode_args[0], mode_args[1],
        min_x_res=lower_x, min_y_res=lower_y,
        max_x_res=upper_x, max_y_res=upper_y,
    )


if __name__ == '__main__':
    # Script entry point: dispatches to manual mode ("-m ...") or to the plain
    # "<thread_link> [download_path]" form. Exits with 0 on success, 1 on error.
    command_line_arguments = get_arguments_from_command_line()
    try:
        # The argument list always holds the script name at index 0, so a usable
        # command line needs at least two entries. Checking this before reading
        # index 1 keeps a bare invocation from being reported through the
        # manual-mode IndexError message below.
        if len(command_line_arguments) < 2 or not has_minimum_argument_thresold(command_line_arguments):
            raise Exception("Not enough arguments.")

        if is_manual_mode_trigger(command_line_arguments[1]):
            manual_mode_download(command_line_arguments)
        else:
            thread_link = command_line_arguments[1]
            # The download path is optional; None is handed on when it is omitted.
            download_path = command_line_arguments[2] if len(command_line_arguments) > 2 else None
            download_files_from_thread(thread_link, download_path)
        sys.exit(0)
    except IndexError:
        # Raised when manual mode is missing positional values, e.g. a
        # resolution flag that is not followed by both numbers.
        print("Please insert arguments correctly while on manual mode.")

    except Exception as e:
        print(e)

    # Only reached after one of the handlers above ran.
    sys.exit(1)
105 changes: 90 additions & 15 deletions scrapper/scrapper_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,32 +56,86 @@ def get_filename(link):
print(f"{number_of_ended_tasks} of {number_of_links_to_download} downloaded!")


async def download_image_task(img_link): # WATCH MY ASS
async def download_image_task(img_link):
"""
Function responsible for creating the task of downloading a single image.
Creates the task of downloading a single image.
Args:
img_link: Link of the image, the standart is a internal link without the protocol included.
Returns:
"""
link = HTTP_PROTOCOL_SYMBOL + ":" + img_link
print(f"Downloading {link}")
download(link)


def get_thread_data_from_web(thread_link):
    """
    Downloads a thread page.

    Args:
        thread_link: URL of the thread.
    Returns: The ``content`` attribute of the response to a GET request on the link.
    """
    response = requests.get(thread_link)
    return response.content


def get_resolution_from_div_text(text):
    """
    Gets resolution from div text.
    Used "File: filename.jpg(491 KB, 1164x705)" as example.

    The resolution is expected after the last comma, optionally followed by a
    closing parenthesis and surrounding whitespace.

    Args:
        text: Text from div.
    Returns: Resolution as tuple of integers (x_res, y_res).
    Raises:
        ValueError: The part after the last comma is not of the form "<int>x<int>".
    """
    # Trim whitespace BEFORE dropping the closing parenthesis; doing it the other
    # way round removes a trailing newline/space instead of the ")" and the
    # integer conversion then fails on "705)".
    cleaned = text.strip()
    # Only drop the last character when it really is the parenthesis, so a text
    # without it does not silently lose a digit.
    if cleaned.endswith(")"):
        cleaned = cleaned[:-1]

    resolution = cleaned.split(",")[-1]
    x_res, y_res = resolution.split("x")
    # int() tolerates the blank left over after the comma.
    return int(x_res), int(y_res)


def get_file_links_from_thread_divs(list_of_divs, **filters):
def get_div_link(image_raw_div):
def check_minimum_resolution(img_x, img_y, min_x, min_y):
if min_x:
x_validation = img_x >= min_x
else:
x_validation = True
if min_y:
y_validation = img_y >= min_y
else:
y_validation = True

return x_validation and y_validation

def check_maximum_resolution(img_x, img_y, max_x, max_y):
if max_x:
x_validation = img_x <= max_x
else:
x_validation = True
if max_y:
y_validation = img_y <= max_y
else:
y_validation = True

return x_validation and y_validation

def check_resolution(img_x, img_y, min_x, min_y, max_x, max_y):
# VALIDATE AS NONE IS ENTERING MIN AND MAX VARS
return check_minimum_resolution(img_x, img_y, min_x, min_y) and check_maximum_resolution(img_x, img_y, max_x,
max_y)

def get_div_link(image_raw_div, **filters):
div_soup = make_soup(image_raw_div, SELECTED_PARSER)

return div_soup.find('a')['href']
div_link = div_soup.find('a')['href']
if filters.values():
x_resolution, y_resolution = get_resolution_from_div_text(div_soup.text)
if not check_resolution(x_resolution, y_resolution, *filters.values()):
div_link = None
return div_link

# filter this after primary tests
file_links = [get_div_link(div_html) for div_html in list_of_divs]
file_links = [get_div_link(div_html, **filters) for div_html in list_of_divs]
#
return file_links

return [link for link in file_links if type(link) is str]

def get_all_divs_with_classname(soup, classname):
return list(map(str, soup.findAll('div', class_=classname)))

def get_all_divs_with_class_name(soup, class_name):
    """
    Collects every div of a given class as a string.

    Args:
        soup: Parsed document offering ``findAll`` (presumably a BeautifulSoup object).
        class_name: Class the divs must have.
    Returns: List with the ``str()`` of each matching div.
    """
    matching_divs = soup.findAll('div', class_=class_name)
    return [str(div) for div in matching_divs]


def get_thread_name(soup):
Expand All @@ -92,7 +146,6 @@ def get_thread_name(soup):
return soup.find('span', class_=THREAD_CLASS_STD_NAME).text



async def download_tasks(download_path, links_from_threads_files):
# Organizing download tasks
thread_download_tasks = []
Expand All @@ -106,9 +159,9 @@ async def download_tasks(download_path, links_from_threads_files):
await asyncio.gather(*thread_download_tasks)


def get_local_html_file_string(file_path):
def get_html_file_as_string(file_path):
"""
Gets html file as string.
Returns html content as string.
Args:
file_path: File path.
Expand All @@ -119,10 +172,25 @@ def get_local_html_file_string(file_path):


def get_thread_data(thread_link):
if "https://boards.4chan.org" in thread_link:
if "boards.4chan.org" in thread_link:
return get_thread_data_from_web(thread_link)
else:
return get_local_html_file_string(thread_link)
return get_html_file_as_string(thread_link)


def has_empty_arguments(t, d):
    """
    Checks whether either argument is an empty string.

    Args:
        t: First value (the caller passes the thread link).
        d: Second value (the caller passes the download path).
    Returns: True when at least one of them equals ''. None does not count as empty.
    """
    return t == '' or d == ''


def create_local_download_path(new_dir_name):
    """
    Builds the path of a download directory below STD_PATH_DOWNLOAD.

    Only the path string is composed here; nothing is created on disk.

    Args:
        new_dir_name: Thread name used as the name of the new directory.
    Returns: Path as string, in the form "<STD_PATH_DOWNLOAD>/<new_dir_name>".
    """
    return "{}/{}".format(STD_PATH_DOWNLOAD, new_dir_name)


def download_files_from_thread(thread_link, download_path, **filters):
Expand All @@ -137,10 +205,17 @@ def download_files_from_thread(thread_link, download_path, **filters):
"""
global number_of_links_to_download

# Analyse arguments
if has_empty_arguments(thread_link, download_path):
raise Exception("Invalid number of arguments. Please, pass at least 2 arguments")

# Gathering files links
thread_html = get_thread_data(thread_link)
thread_soup = make_soup(thread_html, SELECTED_PARSER)
list_of_thread_files_divs_html = get_all_divs_with_classname(thread_soup, DIV_CLASS_STD_NAME)
thread_name_ = get_thread_name(thread_soup)
if not download_path:
create_local_download_path(thread_name_)
list_of_thread_files_divs_html = get_all_divs_with_class_name(thread_soup, DIV_CLASS_STD_NAME)
links_from_threads_files = get_file_links_from_thread_divs(list_of_thread_files_divs_html, **filters)
number_of_links_to_download = len(links_from_threads_files)

Expand Down
34 changes: 29 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,55 @@
import shutil

import pytest
import sys, os
from shutil import rmtree

projects_test_directory = sys.path[0]
projects_root = projects_test_directory + "/.."
temp_dir_name = "$temp"
temp_dir_path = projects_test_directory + "/test_cases/" + temp_dir_name


def remove_recursive(path):
    """
    Removes whatever is found at a path; does nothing when the path is absent.

    Real directories are deleted together with their content. Files and
    symbolic links (including links to directories and dangling links) are
    unlinked without touching the link target.

    Args:
        path: Path of the file, link or directory to remove.
    """
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    # lexists() does not follow symlinks, so a dangling link is cleaned up too;
    # exists() would report it as missing and leave it behind.
    elif os.path.lexists(path):
        os.remove(path)


@pytest.fixture()
def setup_and_teardown_at_temp_dir():
# setup
os.chdir(projects_test_directory + "/test_cases")
os.mkdir(temp_dir_name)
os.chdir(temp_dir_name)
os.mkdir(temp_dir_path)
os.chdir(temp_dir_path)

# test
yield

# teardown
os.chdir("..")
rmtree(f"{temp_dir_name}")
remove_recursive(f"{temp_dir_path}")


@pytest.fixture()
def setup_at_battleship_thread_dir():
def setup_for_thread_name_at_battleship_thread_dir():
# setup
os.chdir(projects_test_directory + "/test_cases/battleship")

# test
yield


@pytest.fixture()
def setup_and_teardown_at_battleship_thread_dir():
    """
    Fixture running a test from the project root with a fresh temp directory.

    Before the test it changes into ``projects_root`` and creates
    ``temp_dir_path``; afterwards it moves one directory up and removes
    ``temp_dir_path`` again through remove_recursive.
    """
    # --- setup: work from the project root and create the temp directory ---
    os.chdir(projects_root)
    os.mkdir(temp_dir_path)

    # --- the test body runs here ---
    yield

    # --- teardown: step out of the current directory, then clean up ---
    os.chdir(os.pardir)
    remove_recursive(temp_dir_path)
1,022 changes: 1,022 additions & 0 deletions tests/test_cases/avatar/avatar.html

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions tests/test_cases/battleship/battleship.html

Large diffs are not rendered by default.

Loading

0 comments on commit ab2b4a5

Please sign in to comment.