explore-app/load_csv_to_mysql.py at main · UKLLC/explore-app · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
###################################################
# purpose: load all metadata to MySQL db
# author: rich thomas
# date: 30/10/2023
##################################################

import pandas as pd
import os
import sys

import dataIO as io
from sqlalchemy import create_engine
import pymysql
import glob

# connection
def connect():
    # need to swap password for local var
    cnxn = create_engine('mysql+pymysql://bq21582:password_password@127.0.0.1:3306/ukllc')
    return(cnxn)

# make table
def make_table(df, table_name):
    # make table
    df.to_sql(table_name, cnxn, if_exists = 'replace', index = False)

# test connection
cnxn = connect()
#t2 = pd.read_sql('select * from t1', cnxn)

#
os.chdir('L:\\Data\\Dev\\programs\\web app')

# data request form load
study_drf = io.load_study_request()
# and create table
make_table(study_drf, 'drf_lps')

# data request form load
nhs_drf = io.load_linked_request()
# need to adjust/trim columns names
nhs_drf = nhs_drf.rename (columns={
    'Number of Participants Included (n=) \n(i.e. number of particpants with non-null data, and with UK LLC, and linkage permission)': 'Number of Participants Included',
    'Health Domain Groupings (i.e. covid infection, asthma, smoking, etc.) ' : 'Health Domain Groupings'
    })
# and strip whitespace
nhs_drf = nhs_drf.rename(columns = lambda x: x.strip())
# get rid of any NULL block names
nhs_drf = nhs_drf.dropna(subset = ['Block Name'])
# and create table
make_table(nhs_drf, 'drf_nhs')

# study info and links
study_info = io.load_study_info_and_links()
make_table(study_info, 'study_info')

# load all metadata
all_files = glob.glob('L:\\Data\\Dev\\programs\\web app')

for root, dirs, files in os.walk('metadata'):
    for name in files:
        fpath = os.path.join(root, name)
        data = pd.read_csv(fpath)
        tab_name = "metadata_"+root.split('\\')[1].lower() + '_' + name.split('.')[0].lower()
        make_table(data, tab_name)