"""Initialise the bike-data database.

Creates the tables, then imports TfL cycling open data: bike points, accidents
involving pedal cycles, and the cycle-hire usage statistics published in the
cycling.data.tfl.gov.uk S3 bucket.
"""
import csv
import json
import logging
import sqlite3
import psycopg2
import psycopg2.extras
from dataclasses import dataclass
from datetime import datetime

import requests

# SQLite file used only by the legacy helpers (create_indexes, create_dashboard_table)
DB_NAME = "bike-data.db"

logFormatter = logging.Formatter("%(asctime)-15s [%(levelname)8s] [%(threadName)s] - %(message)s")
LOG = logging.getLogger()
LOG.setLevel(logging.DEBUG)

fileHandler = logging.FileHandler("db_init.log")
fileHandler.setFormatter(logFormatter)
LOG.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
LOG.addHandler(consoleHandler)


@dataclass
class ApiExportFile:
    """One file entry in the TfL open-data S3 bucket listing."""
    path: str
    download_url: str
    etag: str
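
# Example of one entry as returned by get_online_files_list() below
# (illustrative values only, not real bucket contents):
#   ApiExportFile(
#       path="usage-stats/example-export.csv",
#       download_url="https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/usage-stats/example-export.csv",
#       etag='"0123456789abcdef"',
#   )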


def get_online_files_list(subdir_filter=None, file_extension_filter=None):
    """List the files available in the TfL cycling open-data S3 bucket."""
    import urllib.parse
    import xml.etree.ElementTree

    base_uri = "https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/"
    xml_data = xml.etree.ElementTree.fromstringlist(requests.get(base_uri).text)
    entries = []

    for child in xml_data.findall('{http://s3.amazonaws.com/doc/2006-03-01/}Contents'):
        key = child.find('{http://s3.amazonaws.com/doc/2006-03-01/}Key').text
        etag = child.find('{http://s3.amazonaws.com/doc/2006-03-01/}ETag').text
        # Skip "directory" placeholder keys
        if key.endswith('/'):
            continue

        download_url = base_uri + urllib.parse.quote_plus(key, safe="/")
        entries.append(
            ApiExportFile(key, download_url, etag)
        )

    if subdir_filter:
        entries = list(filter(lambda el: el.path.startswith(subdir_filter), entries))

    if file_extension_filter:
        entries = list(filter(lambda el: el.path.endswith(file_extension_filter), entries))

    return entries
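
# Note: this relies on an unauthenticated GET of the bucket root returning the
# ListBucket XML. S3 list responses are paginated (at most 1,000 keys per page),
# so if the bucket ever grows beyond that, follow-up requests using the "marker"
# query parameter would be needed to fetch the remaining keys.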


def init_database():
    """Create all tables if they do not exist yet."""
    LOG.info("Try to create tables")
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS usage_stats(
        rental_id BIGINT PRIMARY KEY,
        duration BIGINT,
        bike_id BIGINT,
        end_date TIMESTAMP,
        end_station_id BIGINT,
        end_station_name TEXT,
        start_date TIMESTAMP,
        start_station_id BIGINT,
        start_station_name TEXT
    )""")
    cursor.execute("CREATE TABLE IF NOT EXISTS read_files(file_path TEXT, etag TEXT PRIMARY KEY)")
    cursor.execute("""CREATE TABLE IF NOT EXISTS bike_points(
        id TEXT PRIMARY KEY,
        common_name TEXT,
        lat REAL,
        lon REAL,
        id_num INTEGER
    )""")
    cursor.execute("""CREATE TABLE IF NOT EXISTS accidents(
        id INTEGER PRIMARY KEY,
        lat REAL,
        lon REAL,
        location TEXT,
        date TIMESTAMP,
        severity TEXT
    )""")
    conn.commit()
    conn.close()
    LOG.info("Tables created")


def create_indexes():
    """Create search indexes on usage_stats in the SQLite file (legacy path, see main())."""
    LOG.info("Try to create indexes")
    conn = sqlite3.connect(DB_NAME, timeout=300)

    LOG.info("Starting to build index: idx_date_of_start_date")
    conn.execute("""CREATE INDEX IF NOT EXISTS idx_date_of_start_date
                    ON usage_stats (date(start_date, 'unixepoch'))""")
    conn.commit()
    LOG.info("Created index: idx_date_of_start_date")

    LOG.info("Starting to build index: idx_end_station_id_date_of_start_date")
    conn.execute("""CREATE INDEX IF NOT EXISTS "idx_end_station_id_date_of_start_date"
                    ON "usage_stats" ("end_station_id" ASC, date(start_date, 'unixepoch'))""")
    conn.commit()
    LOG.info("Created index: idx_end_station_id_date_of_start_date")

    LOG.info("Starting to build index: idx_start_station_id_date_of_start_date")
    conn.execute("""CREATE INDEX IF NOT EXISTS "idx_start_station_id_date_of_start_date"
                    ON "usage_stats" ("start_station_id" ASC, date("start_date", 'unixepoch'))""")
    conn.commit()
    LOG.info("Created index: idx_start_station_id_date_of_start_date")

    conn.close()
    LOG.info("Indexes created")


def create_dashboard_table():
    """Precompute per-station first/last ride dates (legacy SQLite database)."""
    LOG.info("Creating dashboard table")
    conn = sqlite3.connect(DB_NAME, timeout=300)
    conn.execute("DROP TABLE IF EXISTS dashboard")
    conn.execute("""CREATE TABLE dashboard AS SELECT
                        b.id_num as id,
                        max(date(u.start_date, 'unixepoch')) AS max_end_date,
                        min(date(u.start_date, 'unixepoch')) AS max_start_date
                    FROM usage_stats u
                    JOIN bike_points b ON u.start_station_id = b.id_num
                    GROUP BY b.id_num""")
    conn.commit()
    LOG.info("Created dashboard table")


def import_bikepoints():
    """Import all BikePoint docking stations from the TfL API."""
    LOG.info("Importing bikepoints")
    conn = get_conn()
    cursor = conn.cursor()

    points = json.loads(requests.get("https://api.tfl.gov.uk/BikePoint").text)
    # Flatten each BikePoint JSON object into a bike_points row tuple.
    points = list(map(lambda p: (p['id'], p['commonName'], p['lat'], p['lon'], int(p['id'][11:])), points))

    LOG.info(f"Writing {len(points)} bikepoints to DB")
    cursor.executemany("INSERT INTO bike_points VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING", points)
    conn.commit()
    conn.close()
    LOG.info("Bikepoints imported")


def import_accidents(year):
    """Import all accidents involving a pedal cycle for the given year from the TfL API."""
    LOG.info("Importing accidents")
    conn = get_conn()
    cursor = conn.cursor()

    def filter_pedal_cycles(accident):
        for vehicle in accident['vehicles']:
            if vehicle['type'] == "PedalCycle":
                return True
        return False

    accidents = requests.get(f"https://api.tfl.gov.uk/AccidentStats/{year}").text
    accidents = json.loads(accidents)
    accidents = list(filter(filter_pedal_cycles, accidents))
    accidents = list(map(lambda a: (a['id'], a['lat'], a['lon'], a['location'], a['date'], a['severity']), accidents))

    LOG.info(f"Writing {len(accidents)} bike accidents to DB")
    cursor.executemany("INSERT INTO accidents VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING", accidents)

    conn.commit()
    conn.close()
    LOG.info("Accidents imported")


def import_usage_stats_file(export_file: ApiExportFile):
    """Download one usage-stats CSV export and load it into usage_stats."""
    conn = get_conn()
    cursor = conn.cursor()

    # Skip files that were already imported (tracked by their ETag)
    cursor.execute("SELECT * FROM read_files WHERE etag = %s", (export_file.etag,))
    if len(cursor.fetchall()) != 0:
        LOG.warning(f"Skipping import of {export_file.path}")
        return

    LOG.info(f"DOWNLOADING... {export_file.download_url}")
    content = requests.get(export_file.download_url).content.decode("UTF-8")

    LOG.info(f"Parsing {export_file.path}")
    entries = list(csv.reader(content.splitlines()))[1:]  # drop the header row
    mapped = []
    for entry in entries:
        try:
            mapped.append((
                # Rental Id
                int(entry[0]),
                # Duration or Duration_Seconds
                int(entry[1] or "-1"),
                # Bike Id
                int(entry[2] or "-1"),
                # End Date
                datetime.strptime(entry[3][:16], "%d/%m/%Y %H:%M") if entry[3] else None,
                # EndStation Id
                int(entry[4] or "-1"),
                # EndStation Name
                entry[5].strip(),
                # Start Date
                datetime.strptime(entry[6][:16], "%d/%m/%Y %H:%M") if entry[6] else None,
                # StartStation Id
                int(entry[7]),
                # StartStation Name
                entry[8].strip()
            ))
        except ValueError as e:
            LOG.error(f"Value Error {e} on line {entry}")
            return
        except (KeyError, IndexError) as e:
            # csv.reader yields lists, so short rows raise IndexError rather than KeyError
            LOG.error(f"Key/Index Error {e} on line {entry}")
            return

    LOG.info(f"Writing {len(mapped)} entries to DB")
    psycopg2.extras.execute_values(cursor, "INSERT INTO usage_stats VALUES %s ON CONFLICT DO NOTHING", mapped, page_size=1_000_000)
    cursor.execute("INSERT INTO read_files VALUES (%s, %s) ON CONFLICT DO NOTHING", (export_file.path, export_file.etag))
    conn.commit()
    LOG.info(f"Finished import of {export_file.path}")


def get_conn():
    """Open a new connection to the local Postgres instance."""
    return psycopg2.connect(
        host="localhost",
        database="postgres",
        user="postgres",
        password="supersecure"
    )
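
# Connection parameters are hardcoded for a local development setup; in any other
# environment they would typically be read from configuration or environment variables.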


def main():
    # General DB init
    init_database()
    import_accidents(2019)
    import_bikepoints()

    # count_pre = sqlite3.connect(DB_NAME, timeout=300).execute("SELECT count(*) FROM usage_stats").fetchone()[0]
    #
    # Download and import opendata from S3 bucket
    all_files = get_online_files_list(subdir_filter="usage-stats", file_extension_filter=".csv")
    for file in all_files:
        import_usage_stats_file(file)
    #
    # count_after = sqlite3.connect(DB_NAME, timeout=300).execute("SELECT count(*) FROM usage_stats").fetchone()[0]
    #
    # # Create search-index for faster querying
    # create_indexes()
    # # Import Bikepoints
    # import_bikepoints()
    # # Import bike accidents
    # import_accidents(2019)
    #
    # if count_after - count_pre > 0:
    #     create_dashboard_table()


if __name__ == "__main__":
    main()