Source code for alternat.collection.collector

import shutil
import os
import subprocess
import sys
import pathlib


[docs]class Collector: def __init__(self): """ """
[docs] def process( self, url: str, output_dir_path: str, download_recursive: bool = False, collect_using_apify: bool = False, ): """Collects image from the url into the output directory :param url: [description] :type url: str :param output_dir_path: [description] :type output_dir_path: str :param download_recursive: [description], defaults to False :type download_recursive: bool, optional :param collect_using_apify: [description], defaults to False :type collect_using_apify: bool, optional """ startURLArg = "start_urls=" + url try: os.environ["DOWNLOAD_URL"] = url os.environ["OUTPUT_FOLDER"] = output_dir_path if not os.path.exists(output_dir_path): os.makedirs(output_dir_path) if not collect_using_apify: print("No crawler explicitly mentioned, using default apify crawler") collect_using_apify = True if collect_using_apify: # Get directory name apify_storage_dir = os.path.join(".", "apify_storage") current_dir = pathlib.Path(__file__).parent.absolute() node_folder_path = os.path.join(os.path.expanduser("~"), ".alternat", "node_modules") os.environ["NODE_PATH"] = node_folder_path os.environ["APIFY_LOCAL_STORAGE_DIR"] = apify_storage_dir ## Try to remove tree; if failed show an error using try...except on screen try: shutil.rmtree(apify_storage_dir) except OSError as e: print ("Error: %s - %s." % (e.filename, e.strerror)) if download_recursive: # TODO remove slash to os.path # TODO remove path to current file script_path = os.path.join(current_dir, "apify", "main.js") output = subprocess.check_output(["node", script_path,]) else: print(f"Downloading single page", url) script_path = os.path.join(current_dir, "apify", "main_single_page.js") output = subprocess.check_output(["node", script_path,]) # TODO Remove print statement #print("output", output, flush=True) else: print("Set at least collect-using-apify or collect-using-scrapy to True") except subprocess.CalledProcessError as e: print("\n Subprocess error") print(str(e.output)) except Exception as e: print("\n\n\nException occurred doing cleanup\n\n\n") print(e) raise