Implement first working database init script
parent e0ddbaa24b
commit 7ad0df77cd
projects/project-3/openapi/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
tmp/
bike-data.db
projects/project-3/openapi/db_init.py (new file, 146 lines)
@@ -0,0 +1,146 @@
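# Fetch the TfL cycle-hire usage exports from S3 and import them into a
# local SQLite database (bike-data.db).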
import os.path
import shutil
from dataclasses import dataclass
import logging


WORKING_DIR = os.getcwd()
TMP_DIR = os.path.join(WORKING_DIR, "tmp")

logging.basicConfig(format="%(asctime)-15s [%(levelname)8s] - %(message)s", level=logging.ERROR)
LOG = logging.getLogger("Importer")


@dataclass
class ApiExportFile:
    path: str
    download_url: str
    etag: str


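# List the objects in the public S3 bucket by parsing its XML index,
# optionally filtered by key prefix and file extension.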
def get_online_files_list(subdir_filter=None, file_extension_filter=None):
    import urllib.parse
    import xml.etree.ElementTree
    import requests

    base_uri = "https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/"
    xml_data = xml.etree.ElementTree.fromstringlist(requests.get(base_uri).text)
    entries = []

    for child in xml_data.findall('{http://s3.amazonaws.com/doc/2006-03-01/}Contents'):
        key = child.find('{http://s3.amazonaws.com/doc/2006-03-01/}Key').text
        etag = child.find('{http://s3.amazonaws.com/doc/2006-03-01/}ETag').text
        if key.endswith('/'):
            continue

        download_url = base_uri + urllib.parse.quote_plus(key, safe="/")
        entries.append(
            ApiExportFile(key, download_url, etag)
        )

    if subdir_filter:
        entries = list(filter(lambda el: el.path.startswith(subdir_filter), entries))

    if file_extension_filter:
        entries = list(filter(lambda el: el.path.endswith(file_extension_filter), entries))

    return entries


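# Download one export file into TMP_DIR, skipping files already on disk.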
def download_file(url, save_path):
    import os.path
    import urllib.request

    save_path = os.path.join(TMP_DIR, save_path)

    if os.path.exists(save_path):
        LOG.warning(f"Skipping exists: {save_path}")
        return

    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    LOG.info(f"DOWNLOADING... {url} to {save_path}")
    urllib.request.urlretrieve(url, save_path)


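# Wrapper around the SQLite database that holds the imported usage data.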
class BikeDatabase:

    def __init__(self):
        import sqlite3
        self.conn = sqlite3.connect("bike-data.db")
        self.cursor = self.conn.cursor()

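    # Create the usage_stats table and the read_files bookkeeping table,
    # which records the ETag of every file that has been imported.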
    def init_database(self):
        self.conn.execute("""CREATE TABLE IF NOT EXISTS usage_stats(
            rental_id INTEGER PRIMARY KEY,
            duration INTEGER,
            bike_id INTEGER,
            end_date INTEGER,
            end_station_id INTEGER,
            end_station_name TEXT,
            start_date INTEGER,
            start_station_id INTEGER,
            start_station_name TEXT
        )""")
        self.conn.execute("CREATE TABLE IF NOT EXISTS read_files(file_path TEXT, etag TEXT PRIMARY KEY)")

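    # A file counts as imported once its ETag appears in read_files.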
    def is_file_already_imported(self, etag):
        rows = self.conn.execute("SELECT * FROM read_files WHERE etag LIKE ?", (etag,)).fetchall()
        return len(rows) != 0

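    # Parse one usage-stats CSV and bulk-insert its rows; dates are stored
    # as Unix timestamps and missing numeric fields as -1.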
    def import_usage_stats_file(self, export_file: ApiExportFile):
        from datetime import datetime
        import csv
        import os

        os.chdir(TMP_DIR)

        LOG.info(f"Importing {export_file.path}")
        with open(export_file.path, "r", newline='') as file:
            LOG.info(f"Reading file {export_file.path}")
            entries = list(csv.DictReader(file))
            mapped = []
            for entry in entries:
                try:
                    mapped.append((
                        int(entry['Rental Id']),
                        int(entry['Duration'] or "-1"),
                        int(entry['Bike Id'] or "-1"),
                        int(datetime.strptime(entry['End Date'][:16], "%d/%m/%Y %H:%M").timestamp()) if entry['End Date'] else -1,
                        int(entry['EndStation Id'] or "-1"),
                        entry['EndStation Name'],
                        int(datetime.strptime(entry['Start Date'][:16], "%d/%m/%Y %H:%M").timestamp()) if entry['Start Date'] else -1,
                        int(entry['StartStation Id']),
                        entry['StartStation Name']
                    ))
                except ValueError as e:
                    LOG.error(f"Value Error {e} on line {entry}")
                    return
                except KeyError as e:
                    LOG.error(f"Key Error {e} on line {entry}")
                    return
            LOG.info(f"Writing {len(mapped)} entries to DB")
            self.cursor.executemany("INSERT INTO usage_stats VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", mapped)
            self.cursor.execute("INSERT INTO read_files VALUES (?, ?)", (export_file.path, export_file.etag))
            self.conn.commit()
        LOG.info(f"Finished import of {export_file.path}")
        os.chdir(WORKING_DIR)
        shutil.rmtree(TMP_DIR)
        LOG.info("Deleted temp dir")


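# Import every usage-stats CSV that has not been imported yet.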
def main():
    all_files = get_online_files_list(subdir_filter="usage-stats", file_extension_filter=".csv")
    db = BikeDatabase()
    db.init_database()

    for file in all_files:
        if not db.is_file_already_imported(file.etag):
            download_file(file.download_url, file.path)
            db.import_usage_stats_file(file)
        else:
            LOG.warning(f"Skipping import of {file.path}")


if __name__ == "__main__":
    main()

@@ -1,34 +0,0 @@
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import os.path

import requests


BASE_URI = "https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/"
BASE_DIR = "data/"

xml_data = xml.etree.ElementTree.fromstringlist(requests.get(BASE_URI).text)

for child in xml_data.findall('{http://s3.amazonaws.com/doc/2006-03-01/}Contents'):
    key = child.find('{http://s3.amazonaws.com/doc/2006-03-01/}Key').text
    if key.endswith('/'):
        continue

    parts = key.rsplit('/')
    # create download url
    parts.append(urllib.parse.quote_plus(parts.pop()))
    download_url = BASE_URI + "/".join(parts)

    # create folders and files
    parts.append(urllib.parse.unquote_plus(parts.pop()))
    save_path = BASE_DIR + "/".join(parts)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # skip already downloaded files
    if os.path.exists(save_path):
        continue

    # do the download
    urllib.request.urlretrieve(download_url, save_path)
    print(f"DOWNLOADING... {download_url} to {save_path}")

@@ -1,57 +0,0 @@
import csv
import os


def read_files(base_dir, max_num=2):
    data = []
    for file in os.listdir(base_dir):
        if max_num == 0:
            break
        print(f'reading file \'{file}\'')
        with open(os.path.join(base_dir, file), 'r', newline='') as input_file:
            data.extend(list(csv.DictReader(input_file)))
        max_num -= 1
    return data


def get_key_counts(data):
    all_key_sets = [e.keys() for e in data]
    key_counts = {}
    for key_set in all_key_sets:
        for key in key_set:
            try:
                key_counts[key] += 1
            except KeyError:
                key_counts[key] = 1
    return key_counts


def main():
    data = read_files('data/usage-stats', max_num=10)
    print("Sorting")
    data = sorted(data, key=lambda entry: int(entry['Rental Id']))
    print(f'final length of data {len(data)}')

    # counts = get_key_counts(data)
    # for count, val in counts.items():
    #     if val != len(data):
    #         print(count, val)
    # print(counts)
    with open('test.csv', 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, data[0].keys(), extrasaction='ignore')
        writer.writeheader()
        writer.writerows(data)


if __name__ == '__main__':
    main()


#
# with open('test.csv') as again_in:
#     reader2 = csv.DictReader(again_in)
#     data2 = list(reader2)
#
# print(data2[0])
#
# print(f"Is same? {data[0] == data2[0]}")
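
For reference, a minimal sketch of reading the database that db_init.py produces. The table and column names come from the CREATE TABLE statement above; the query itself is illustrative and not part of the commit.

import sqlite3

# Open the database file created by BikeDatabase (assumed to be in the
# current working directory, as in db_init.py).
conn = sqlite3.connect("bike-data.db")

# Example query: the ten longest rentals, with start and end station names.
# Values appear exactly as the importer wrote them (duration as-is from the
# CSV, dates as Unix timestamps).
rows = conn.execute(
    """SELECT rental_id, duration, start_station_name, end_station_name
       FROM usage_stats
       ORDER BY duration DESC
       LIMIT 10"""
).fetchall()

for row in rows:
    print(row)

conn.close()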