"""Importer for TfL (Transport for London) cycling open data.

Downloads the usage-stats CSV exports from TfL's public S3 bucket and the
live BikePoint list from the TfL API, and loads both into a local SQLite
database (``bike-data.db``). Re-running is idempotent: already-imported
CSV files are skipped by their S3 ETag.
"""

import csv
import json
import logging
import sqlite3
import urllib.parse
import xml.etree.ElementTree
from contextlib import closing
from dataclasses import dataclass
from datetime import datetime

import requests

# Keep these values stable — the DB file and table names are the script's
# external contract with any downstream querying/tooling.
DB_PATH = "bike-data.db"
DB_TIMEOUT = 300
S3_BASE_URI = "https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/"
S3_NS = "{http://s3.amazonaws.com/doc/2006-03-01/}"  # S3 ListBucket XML namespace
HTTP_TIMEOUT = 120  # seconds; some usage-stats CSVs are large

logFormatter = logging.Formatter(
    "%(asctime)-15s [%(levelname)8s] [%(threadName)s] - %(message)s")
LOG = logging.getLogger()
LOG.setLevel(logging.DEBUG)
fileHandler = logging.FileHandler("importer.log")
fileHandler.setFormatter(logFormatter)
LOG.addHandler(fileHandler)
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
LOG.addHandler(consoleHandler)


@dataclass
class ApiExportFile:
    """One downloadable object in the TfL open-data S3 bucket."""
    path: str          # S3 object key, e.g. "usage-stats/....csv"
    download_url: str  # fully qualified, URL-encoded download location
    etag: str          # S3 ETag, used to skip files already imported


def get_online_files_list(subdir_filter=None, file_extension_filter=None):
    """Return the bucket's file listing as ``ApiExportFile`` entries.

    :param subdir_filter: if given, keep only keys starting with this prefix.
    :param file_extension_filter: if given, keep only keys ending with this
        suffix (e.g. ``".csv"``).
    :raises requests.HTTPError: if the bucket listing request fails.
    """
    # NOTE(review): an S3 ListBucket response is capped at 1000 keys and this
    # does not follow <IsTruncated>/marker pagination — confirm the bucket
    # stays under the limit or add pagination.
    response = requests.get(S3_BASE_URI, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    # fromstring, not fromstringlist: we hold one complete XML document.
    # (fromstringlist on a str iterates it character by character.)
    xml_data = xml.etree.ElementTree.fromstring(response.text)

    entries = []
    for child in xml_data.findall(S3_NS + 'Contents'):
        key = child.find(S3_NS + 'Key').text
        etag = child.find(S3_NS + 'ETag').text
        if key.endswith('/'):
            continue  # "directory" placeholder objects carry no data
        download_url = S3_BASE_URI + urllib.parse.quote_plus(key, safe="/")
        entries.append(ApiExportFile(key, download_url, etag))

    if subdir_filter:
        entries = [e for e in entries if e.path.startswith(subdir_filter)]
    if file_extension_filter:
        entries = [e for e in entries if e.path.endswith(file_extension_filter)]
    return entries


def init_database():
    """Create the tables (idempotent via IF NOT EXISTS)."""
    LOG.info("Try to create tables")
    with closing(sqlite3.connect(DB_PATH, timeout=DB_TIMEOUT)) as conn:
        conn.execute("""CREATE TABLE IF NOT EXISTS usage_stats(
            rental_id INTEGER PRIMARY KEY,
            duration INTEGER,
            bike_id INTEGER,
            end_date INTEGER,
            end_station_id INTEGER,
            end_station_name TEXT,
            start_date INTEGER,
            start_station_id INTEGER,
            start_station_name TEXT
        )""")
        conn.execute(
            "CREATE TABLE IF NOT EXISTS read_files("
            "file_path TEXT, etag TEXT PRIMARY KEY)")
        # id_num is a generated column (SQLite >= 3.31): the numeric tail of
        # ids like "BikePoints_42", handy for joins against usage_stats.
        conn.execute("""CREATE TABLE IF NOT EXISTS bike_points(
            id TEXT PRIMARY KEY,
            common_name TEXT,
            lat REAL,
            lon REAL,
            id_num INTEGER AS (CAST(SUBSTR(id, 12) as INTEGER)) STORED
        )""")
        conn.commit()
    LOG.info("Tables created")


def create_indexes():
    """Create the query index on the start date (idempotent)."""
    LOG.info("Try to create indexes")
    with closing(sqlite3.connect(DB_PATH, timeout=DB_TIMEOUT)) as conn:
        conn.execute("""CREATE INDEX IF NOT EXISTS idx_date_of_start_date
            ON usage_stats (date(start_date, "unixepoch"))""")
        conn.commit()
    LOG.info("Indexes created")


def import_bikepoints():
    """Fetch the live BikePoint list from the TfL API and upsert it.

    :raises requests.HTTPError: if the API request fails.
    """
    LOG.info("Importing bikepoints")
    response = requests.get("https://api.tfl.gov.uk/BikePoint",
                            timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    points = [(p['id'], p['commonName'], p['lat'], p['lon'])
              for p in json.loads(response.text)]
    with closing(sqlite3.connect(DB_PATH, timeout=DB_TIMEOUT)) as conn:
        # Only the four stored columns are supplied; id_num is generated.
        conn.executemany(
            "INSERT OR IGNORE INTO bike_points VALUES (?, ?, ?, ?)", points)
        conn.commit()
    LOG.info("Bikepoints imported")


def _parse_usage_row(entry):
    """Map one raw CSV row to a usage_stats tuple.

    Missing numeric fields become -1; missing dates become -1.
    Raises ValueError/IndexError on malformed rows.
    """
    # NOTE(review): strptime produces naive datetimes, so .timestamp() uses
    # the machine's local timezone — confirm whether the exports are
    # London-local and whether that matters for downstream date() queries.
    return (
        # Rental Id
        int(entry[0]),
        # Duration (or Duration_Seconds in newer exports)
        int(entry[1] or "-1"),
        # Bike Id
        int(entry[2] or "-1"),
        # End Date
        int(datetime.strptime(entry[3][:16], "%d/%m/%Y %H:%M").timestamp())
        if entry[3] else -1,
        # EndStation Id
        int(entry[4] or "-1"),
        # EndStation Name
        entry[5].strip(),
        # Start Date
        int(datetime.strptime(entry[6][:16], "%d/%m/%Y %H:%M").timestamp())
        if entry[6] else -1,
        # StartStation Id
        int(entry[7]),
        # StartStation Name
        entry[8].strip(),
    )


def import_usage_stats_file(export_file: ApiExportFile):
    """Download one usage-stats CSV and load it into usage_stats.

    Files whose ETag is already recorded in read_files are skipped, making
    repeated runs idempotent. A file that fails to parse is logged and
    abandoned without recording its ETag, so it is retried next run.
    """
    with closing(sqlite3.connect(DB_PATH, timeout=DB_TIMEOUT)) as conn:
        # Exact match ('='): with LIKE, any %/_ in the ETag would act as a
        # wildcard and could falsely match another file.
        rows = conn.execute("SELECT * FROM read_files WHERE etag = ?",
                            (export_file.etag,)).fetchall()
        if rows:
            LOG.warning("Skipping import of %s", export_file.path)
            return

        LOG.info("DOWNLOADING... %s", export_file.download_url)
        response = requests.get(export_file.download_url, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
        content = response.content.decode("UTF-8")

        LOG.info("Parsing %s", export_file.path)
        entries = list(csv.reader(content.splitlines()))[1:]  # skip header
        mapped = []
        for entry in entries:
            try:
                mapped.append(_parse_usage_row(entry))
            # IndexError is what a short row actually raises (list indexing
            # never raises KeyError); abandon the whole file on any bad row.
            except (ValueError, IndexError, KeyError) as e:
                LOG.error("Error %r on line %s", e, entry)
                return

        LOG.info("Writing %d entries to DB", len(mapped))
        conn.executemany(
            "INSERT OR IGNORE INTO usage_stats "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", mapped)
        # Record the ETag only after a fully successful parse + insert.
        conn.execute("INSERT OR IGNORE INTO read_files VALUES (?, ?)",
                     (export_file.path, export_file.etag))
        conn.commit()
    LOG.info("Finished import of %s", export_file.path)


def main():
    """Full pipeline: schema, S3 usage stats, bike points, then indexes."""
    # General DB init
    init_database()
    # Download and import opendata from S3 bucket
    all_files = get_online_files_list(subdir_filter="usage-stats",
                                      file_extension_filter=".csv")
    for file in all_files:
        import_usage_stats_file(file)
    # Import Bikepoints
    import_bikepoints()
    # Create search-index for faster querying (after bulk load, so inserts
    # don't pay per-row index maintenance)
    create_indexes()


if __name__ == "__main__":
    main()