#!/usr/bin/env python3
"""West Allis tax-information pipeline.

Three operations, selected by the first CLI argument:

* ``scrape`` - drive a (local or remote) Selenium browser through a
  FileQueue of tax ids and archive each fetched page as
  ``<output-dir>/<unix-timestamp>_<tax-id>.html``.
* ``json``   - parse every archived ``*.html`` into a sibling ``*.json``.
* ``csv``    - fold every ``*.json`` into one ``sales_history.csv``.

Remaining arguments are ``--flag=value`` pairs, collected into a dict.
"""
import datetime
import glob
import json
import logging
import sys
import time

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from lib import utils, FileQueue
from lib.sources import WestAllisSource, WestAllisSalesHistorySource
from lib.parsers import WestAllisTax, WestAllisSalesHistory
from lib.serializers import WestAllisSalesHistorySerializer

LOG_FORMAT = '%(levelname)s:%(name)s:%(asctime)-15s: %(message)s'


def wa_ti_scrape(arguments):
    """Fetch pages for each queued tax id and archive the raw HTML.

    Loops until the FileQueue is exhausted (or the process is killed),
    sleeping ``--seconds-between-requests`` between fetches.

    :param arguments: dict of ``--flag`` -> value CLI options.
    """
    logging.basicConfig(format=LOG_FORMAT)
    logger = logging.getLogger('scrape')
    logger.setLevel(logging.INFO)

    # LOCAL or REMOTE
    DRIVER_TYPE = arguments.get('--driver-type')
    # required for REMOTE
    DRIVER_URL = arguments.get('--driver-url')
    # WEST_ALLIS, WEST_ALLIS_SALES_HISTORY
    ENDPOINT = arguments.get('--endpoint')
    TAX_IDS_FILEPATH = arguments.get('--tax-ids-filepath')
    CURRENT_INDEX_FILEPATH = arguments.get('--current-index-filepath')
    SECONDS_BETWEEN_REQUESTS = int(arguments.get('--seconds-between-requests'))
    OUTPUT_DIRECTORY = arguments.get('--output-directory')

    SOURCE_TYPES = {
        'WEST_ALLIS': WestAllisSource,
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistorySource
    }

    driver = utils.get_driver(DRIVER_TYPE, {
        'command_executor': DRIVER_URL,
        'desired_capabilities': DesiredCapabilities.FIREFOX
    })

    q = FileQueue(TAX_IDS_FILEPATH, CURRENT_INDEX_FILEPATH)
    create_source = SOURCE_TYPES[ENDPOINT]
    source = create_source(driver)

    while True:
        next_id = q.peek()
        content = source.fetch(next_id)
        archive_path = '{}/{}_{}.html'.format(
            OUTPUT_DIRECTORY, int(time.time()), next_id)
        utils.write_archive(archive_path, content)
        logger.info('Received content for {} and saving to {}'.format(
            next_id, archive_path))
        # Advance the queue only after a successful fetch + archive, so a
        # crash resumes at the unfinished id on the next run.
        q.pop()
        time.sleep(SECONDS_BETWEEN_REQUESTS)


def wa_ti_json(arguments):
    """Parse every archived ``*.html`` file into a sibling ``*.json`` file.

    The archive filename encodes ``<unix-timestamp>_<tax-id>.html``; the
    timestamp becomes the parsed record's ``created_on``.

    :param arguments: dict of ``--flag`` -> value CLI options.
    :raises Exception: if a file's contents could not be read.
    """
    logging.basicConfig(format=LOG_FORMAT)
    logger = logging.getLogger('json')
    logger.setLevel(logging.INFO)

    ENDPOINT = arguments.get('--endpoint')
    HTML_DIR = arguments.get('--html-directory')

    SOURCE_TYPES = {
        'WEST_ALLIS': WestAllisTax,
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistory
    }

    create_parser = SOURCE_TYPES[ENDPOINT]
    for fp in glob.glob('{}/*.html'.format(HTML_DIR)):
        with open(fp, 'r') as f:
            contents = f.read()
        if contents is None:
            raise Exception('Contents cannot be none for {}'.format(fp))
        # '<dir>/<timestamp>_<tax_id>.html' -> '<timestamp>_<tax_id>'
        filename = fp.split('/')[-1].split('.')[0]
        timestamp = int(filename.split('_')[0])
        created_on = datetime.datetime.fromtimestamp(timestamp)
        parsed = create_parser(contents).parse(created_on=created_on)
        write_fp = '{}/{}.json'.format(HTML_DIR, filename)
        serialized = json.dumps(parsed)
        logger.info('Parsed content for {} and saving to {}'.format(
            filename, write_fp))
        with open(write_fp, 'w') as f:
            f.write(serialized)


def wa_ti_csv(arguments):
    """Serialize every ``*.json`` file into one ``sales_history.csv``.

    The JSON filename encodes ``<timestamp>_<tax_id>.json``; the tax id is
    passed through to the serializer alongside the raw JSON text.

    :param arguments: dict of ``--flag`` -> value CLI options.
    :raises Exception: if a file's contents could not be read.
    """
    logging.basicConfig(format=LOG_FORMAT)
    # NOTE: was logging.getLogger('json') — a copy/paste slip; this is the
    # csv step, so label its log lines accordingly.
    logger = logging.getLogger('csv')
    logger.setLevel(logging.INFO)

    ENDPOINT = arguments.get('--endpoint')
    JSON_DIR = arguments.get('--json-directory')

    SOURCE_TYPES = {
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistorySerializer
    }

    create_serializer = SOURCE_TYPES[ENDPOINT]
    serializer = create_serializer()

    # Collect chunks and join once rather than repeated += (quadratic).
    chunks = []
    for fp in glob.glob('{}/*.json'.format(JSON_DIR)):
        with open(fp, 'r') as f:
            contents = f.read()
        if contents is None:
            raise Exception('Contents cannot be none for {}'.format(fp))
        # '<dir>/<timestamp>_<tax_id>.json' -> tax_id
        filename = fp.split('/')[-1].split('.')[0]
        tax_id = filename.split('_')[1]
        chunks.append(serializer.serialize(tax_id, contents))

    write_fp = '{}/sales_history.csv'.format(JSON_DIR)
    logger.info('Serialized all json content and saving to {}'.format(write_fp))
    with open(write_fp, 'w') as f:
        f.write(''.join(chunks))


# Dispatch table: first CLI argument -> operation callable.
operations = {
    'scrape': wa_ti_scrape,
    'json': wa_ti_json,
    'csv': wa_ti_csv
}


def run():
    """CLI entry point: ``<script> <operation> --flag=value ...``.

    Collects the ``--flag=value`` pairs into a dict and dispatches to the
    matching operation. Later duplicate flags override earlier ones.
    """
    sys.argv.pop(0)  # drop the script name
    operation = sys.argv.pop(0)
    arguments = {}
    for arg in sys.argv:
        # Split on the FIRST '=' only so values may themselves contain '='
        # (e.g. --driver-url=http://host?a=b).
        topic, value = arg.split('=', 1)
        arguments[topic] = value
    operations[operation](arguments)


if __name__ == '__main__':
    run()