adds license
[west_allis_property_tax] / main
1 #!/usr/bin/env python3
2 import time, os, sys, logging, glob, datetime, json
3 from lib                import utils, FileQueue
4 from lib.sources        import WestAllisSource, WestAllisSalesHistorySource
5 from lib.parsers        import WestAllisTax, WestAllisSalesHistory
6 from lib.serializers    import WestAllisSalesHistorySerializer
7 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
8
9 LOG_FORMAT = '%(levelname)s:%(name)s:%(asctime)-15s: %(message)s'
10
def wa_ti_scrape(arguments):
    """Fetch the page for each queued tax id and archive the raw HTML.

    For every id returned by the file queue, the content is fetched through
    the selected source, written to
    ``<output-directory>/<unix timestamp>_<id>.html``, and only then popped
    from the queue, so an id whose fetch or write fails stays at the head of
    the queue.

    Args:
        arguments: mapping of command-line flag to string value. Reads
            ``--driver-type``, ``--driver-url``, ``--endpoint``,
            ``--tax-ids-filepath``, ``--current-index-filepath``,
            ``--seconds-between-requests`` and ``--output-directory``.

    Raises:
        KeyError: if ``--endpoint`` is not a key of ``SOURCE_TYPES``.
        TypeError: if ``--seconds-between-requests`` is absent.
        ValueError: if ``--seconds-between-requests`` is not an integer.

    The loop has no exit condition of its own; it ends when one of the calls
    inside it raises (or the process is interrupted).
    """
    logging.basicConfig(format=LOG_FORMAT)
    logger = logging.getLogger('scrape')
    logger.setLevel(logging.INFO)

    # LOCAL or REMOTE
    DRIVER_TYPE = arguments.get('--driver-type')

    # required for REMOTE
    DRIVER_URL = arguments.get('--driver-url')

    # WEST_ALLIS, WEST_ALLIS_SALES_HISTORY
    ENDPOINT = arguments.get('--endpoint')

    TAX_IDS_FILEPATH = arguments.get('--tax-ids-filepath')
    CURRENT_INDEX_FILEPATH = arguments.get('--current-index-filepath')
    SECONDS_BETWEEN_REQUESTS = int(arguments.get('--seconds-between-requests'))
    OUTPUT_DIRECTORY = arguments.get('--output-directory')
    SOURCE_TYPES = {
        'WEST_ALLIS': WestAllisSource,
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistorySource
    }

    # Resolve the endpoint before a browser is started so that an unknown
    # endpoint fails immediately instead of leaving a driver behind.
    create_source = SOURCE_TYPES[ENDPOINT]

    driver = utils.get_driver(DRIVER_TYPE, {
        'command_executor': DRIVER_URL,
        'desired_capabilities': DesiredCapabilities.FIREFOX
    })

    try:
        q = FileQueue(TAX_IDS_FILEPATH, CURRENT_INDEX_FILEPATH)
        source = create_source(driver)

        while True:
            next_id = q.peek()
            content = source.fetch(next_id)
            archive_path = '{}/{}_{}.html'.format(OUTPUT_DIRECTORY, int(time.time()), next_id)
            utils.write_archive(archive_path, content)
            logger.info('Recieved content for {} and saving to {}'.format(next_id, archive_path))
            # Pop only after the archive is written so a failed id is retried
            # on the next run.
            q.pop()
            time.sleep(SECONDS_BETWEEN_REQUESTS)
    finally:
        # NOTE(review): utils.get_driver is presumably returning a selenium
        # WebDriver (it is given command_executor/desired_capabilities);
        # confirm. The lookup is guarded so that a driver object without
        # quit() does not mask the exception that ended the loop.
        quit_driver = getattr(driver, 'quit', None)
        if callable(quit_driver):
            quit_driver()
51
def wa_ti_json(arguments):
    """Parse every ``.html`` file in a directory and write it out as JSON.

    Each input file name must start with an integer Unix timestamp followed
    by ``_``; that timestamp is converted with ``datetime.fromtimestamp`` and
    handed to the parser as ``created_on``. The result of ``parse`` is dumped
    with ``json.dumps`` to ``<html-directory>/<input stem>.json``.

    Args:
        arguments: mapping of command-line flag to string value. Reads
            ``--endpoint`` and ``--html-directory``.

    Raises:
        KeyError: if ``--endpoint`` is not a key of ``SOURCE_TYPES``.
        ValueError: if a file name does not start with an integer.
    """
    logging.basicConfig(format=LOG_FORMAT)
    logger = logging.getLogger('json')
    logger.setLevel(logging.INFO)

    ENDPOINT = arguments.get('--endpoint')
    HTML_DIR = arguments.get('--html-directory')
    SOURCE_TYPES = {
        'WEST_ALLIS': WestAllisTax,
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistory
    }

    create_parser = SOURCE_TYPES[ENDPOINT]

    for fp in glob.glob('{}/*.html'.format(HTML_DIR)):
        # The context manager closes the file; no explicit close() needed.
        with open(fp, 'r') as f:
            contents = f.read()

        # basename/splitext instead of splitting on '/' and '.': works with
        # any path separator and strips only the final extension, so a stem
        # that itself contains a dot is not truncated.
        filename = os.path.splitext(os.path.basename(fp))[0]

        timestamp = int(filename.split('_')[0])
        created_on = datetime.datetime.fromtimestamp(timestamp)
        parsed = create_parser(contents).parse(created_on=created_on)

        write_fp = '{}/{}.json'.format(HTML_DIR, filename)
        serialized = json.dumps(parsed)
        logger.info('Parsed content for {} and saving to {}'.format(filename, write_fp))

        with open(write_fp, 'w') as f:
            f.write(serialized)
92
def wa_ti_csv(arguments):
    """Serialize every ``.json`` file in a directory into one CSV file.

    The tax id passed to the serializer is everything after the first ``_``
    in the file stem. The serialized pieces are concatenated and written to
    ``<json-directory>/sales_history.csv``.

    Args:
        arguments: mapping of command-line flag to string value. Reads
            ``--endpoint`` and ``--json-directory``.

    Raises:
        KeyError: if ``--endpoint`` is not a key of ``SOURCE_TYPES``.
        IndexError: if a file stem contains no ``_``.
    """
    logging.basicConfig(format=LOG_FORMAT)
    # Was 'json', copied from wa_ti_json; log lines now name this operation.
    logger = logging.getLogger('csv')
    logger.setLevel(logging.INFO)

    ENDPOINT = arguments.get('--endpoint')
    JSON_DIR = arguments.get('--json-directory')
    SOURCE_TYPES = {
        'WEST_ALLIS_SALES_HISTORY': WestAllisSalesHistorySerializer
    }

    create_serializer = SOURCE_TYPES[ENDPOINT]
    serializer = create_serializer()

    # glob returns files in arbitrary order; sorting makes the CSV
    # reproducible from run to run.
    files = sorted(glob.glob('{}/*.json'.format(JSON_DIR)))

    chunks = []
    for fp in files:
        # The context manager closes the file; no explicit close() needed.
        with open(fp, 'r') as f:
            contents = f.read()

        # Separator-agnostic, and strips only the final extension.
        filename = os.path.splitext(os.path.basename(fp))[0]
        # Split once so an id that itself contains '_' is kept whole.
        tax_id = filename.split('_', 1)[1]

        chunks.append(serializer.serialize(tax_id, contents))

    csv_data = ''.join(chunks)

    write_fp = '{}/sales_history.csv'.format(JSON_DIR)
    logger.info('Serialized all json content and saving to {}'.format(write_fp))

    with open(write_fp, 'w') as f:
        f.write(csv_data)
132
# Operation name given on the command line -> function that handles it.
operations = dict(
    scrape=wa_ti_scrape,
    json=wa_ti_json,
    csv=wa_ti_csv,
)
138
def run():
    """Dispatch ``<operation> --flag=value ...`` from the command line.

    The first argument after the script name selects an entry of
    ``operations``; every remaining argument is split at its first ``=`` into
    a flag and a value, and the resulting dict is passed to the selected
    function. ``sys.argv`` itself is left unmodified.

    Raises:
        SystemExit: with an explanatory message when the operation is
            missing or unknown, or when an argument has no ``=``.
    """
    argv = sys.argv[1:]  # drop the script name
    known = ', '.join(sorted(operations))

    if not argv:
        sys.exit('usage: <operation> [--flag=value ...] (operations: {})'.format(known))

    operation = argv[0]
    if operation not in operations:
        sys.exit('unknown operation {!r} (operations: {})'.format(operation, known))

    arguments = {}
    # Walk the arguments back to front so that, when a flag is repeated, the
    # occurrence nearest the start of the command line is the one that sticks.
    for arg in reversed(argv[1:]):
        # partition splits at the first '=' only, so values that contain '='
        # themselves (for example a URL with a query string) stay intact.
        topic, separator, value = arg.partition('=')
        if not separator:
            sys.exit('expected --flag=value, got {!r}'.format(arg))
        arguments[topic] = value

    operations[operation](arguments)
151
# Start the command-line dispatcher only when this file is executed as a
# script; importing the module does not call run().
if __name__ == '__main__':
    run()
154