diff --git a/hw11/README.md b/hw11/README.md
new file mode 100644
index 0000000..4bfcb82
--- /dev/null
+++ b/hw11/README.md
@@ -0,0 +1,13 @@
+# HW11 - Working with SQL databases via Python
+Here we cover the following things:
+* Creation of SQL databases
+* Selection and insertion of information
+* Joining several tables via foreign key and fetching information from them
+
+## Data
+We work with data about SNPs. The initial datasets are too large to upload to GitHub,
+so to run this notebook they have to be downloaded manually via [link](https://drive.google.com/file/d/1NWIT8Yn-GdgpBUfFO87dnIDQgmE5nj-j/view?usp=sharing)
+and placed in the `../data` folder.
+
+
+
diff --git a/hw11/databases.ipynb b/hw11/databases.ipynb
new file mode 100644
index 0000000..d135f1f
--- /dev/null
+++ b/hw11/databases.ipynb
@@ -0,0 +1,737 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Working with databases via `sqlite3`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The initial datasets are too large to upload to GitHub.\n",
+ "So to run this notebook they have to be downloaded manually via [link](https://drive.google.com/file/d/1NWIT8Yn-GdgpBUfFO87dnIDQgmE5nj-j/view?usp=sharing) and placed in the `../data` folder."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:36.932795Z",
+ "start_time": "2022-03-17T21:35:36.784953Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import sqlite3\n",
+ "from IPython.display import display\n",
+ "import warnings \n",
+ "warnings.simplefilter('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Look at our dataframes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:41.803479Z",
+ "start_time": "2022-03-17T21:35:37.869965Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "metadata shape: (841, 4)\n",
+ "genstudio shape: (2000000, 20)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " dna_chip_id | \n",
+ " breed | \n",
+ " sex | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 202290551164R09C01 | \n",
+ " Д | \n",
+ " Хр | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 202341831114R02C01 | \n",
+ " Д | \n",
+ " Хр | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 202341831114R03C01 | \n",
+ " Д | \n",
+ " Хр | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 202341831114R04C01 | \n",
+ " Д | \n",
+ " Хр | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 202290551140R01C01 | \n",
+ " Д | \n",
+ " Хр | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 dna_chip_id breed sex\n",
+ "0 0 202290551164R09C01 Д Хр\n",
+ "1 1 202341831114R02C01 Д Хр\n",
+ "2 2 202341831114R03C01 Д Хр\n",
+ "3 3 202341831114R04C01 Д Хр\n",
+ "4 4 202290551140R01C01 Д Хр"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " SNP Name | \n",
+ " SNP Index | \n",
+ " SNP Aux | \n",
+ " Sample ID | \n",
+ " SNP | \n",
+ " Allele1 - Top | \n",
+ " Allele2 - Top | \n",
+ " Allele1 - Forward | \n",
+ " Allele2 - Forward | \n",
+ " Allele1 - AB | \n",
+ " Allele2 - AB | \n",
+ " Chr | \n",
+ " Position | \n",
+ " GC Score | \n",
+ " GT Score | \n",
+ " Theta | \n",
+ " R | \n",
+ " B Allele Freq | \n",
+ " Log R Ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1_10573221 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 202341831114R01C01 | \n",
+ " [T/C] | \n",
+ " - | \n",
+ " - | \n",
+ " - | \n",
+ " - | \n",
+ " - | \n",
+ " - | \n",
+ " 1 | \n",
+ " 10573221 | \n",
+ " 0.0000 | \n",
+ " 0.0000 | \n",
+ " 0.942 | \n",
+ " 0.413 | \n",
+ " 1.0000 | \n",
+ " 0.4040 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 1_10673082 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 202341831114R01C01 | \n",
+ " [T/C] | \n",
+ " A | \n",
+ " A | \n",
+ " T | \n",
+ " T | \n",
+ " A | \n",
+ " A | \n",
+ " 1 | \n",
+ " 10673082 | \n",
+ " 0.8272 | \n",
+ " 0.8076 | \n",
+ " 0.039 | \n",
+ " 0.968 | \n",
+ " 0.0000 | \n",
+ " 0.3017 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 1_10723065 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 202341831114R01C01 | \n",
+ " [A/G] | \n",
+ " A | \n",
+ " A | \n",
+ " T | \n",
+ " T | \n",
+ " A | \n",
+ " A | \n",
+ " 1 | \n",
+ " 10723065 | \n",
+ " 0.8316 | \n",
+ " 0.8107 | \n",
+ " 0.011 | \n",
+ " 1.577 | \n",
+ " 0.0000 | \n",
+ " 0.0388 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 1_11337555 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 202341831114R01C01 | \n",
+ " [A/G] | \n",
+ " A | \n",
+ " A | \n",
+ " T | \n",
+ " T | \n",
+ " A | \n",
+ " A | \n",
+ " 1 | \n",
+ " 11337555 | \n",
+ " 0.3781 | \n",
+ " 0.7925 | \n",
+ " 0.045 | \n",
+ " 1.104 | \n",
+ " 0.0000 | \n",
+ " 0.2761 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 1_11407894 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 202341831114R01C01 | \n",
+ " [A/G] | \n",
+ " G | \n",
+ " G | \n",
+ " G | \n",
+ " G | \n",
+ " B | \n",
+ " B | \n",
+ " 1 | \n",
+ " 11407894 | \n",
+ " 0.9038 | \n",
+ " 0.8670 | \n",
+ " 0.983 | \n",
+ " 1.122 | \n",
+ " 0.9994 | \n",
+ " 0.0022 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 SNP Name SNP Index SNP Aux Sample ID SNP \\\n",
+ "0 0 1_10573221 1 0 202341831114R01C01 [T/C] \n",
+ "1 1 1_10673082 2 0 202341831114R01C01 [T/C] \n",
+ "2 2 1_10723065 3 0 202341831114R01C01 [A/G] \n",
+ "3 3 1_11337555 4 0 202341831114R01C01 [A/G] \n",
+ "4 4 1_11407894 5 0 202341831114R01C01 [A/G] \n",
+ "\n",
+ " Allele1 - Top Allele2 - Top Allele1 - Forward Allele2 - Forward \\\n",
+ "0 - - - - \n",
+ "1 A A T T \n",
+ "2 A A T T \n",
+ "3 A A T T \n",
+ "4 G G G G \n",
+ "\n",
+ " Allele1 - AB Allele2 - AB Chr Position GC Score GT Score Theta R \\\n",
+ "0 - - 1 10573221 0.0000 0.0000 0.942 0.413 \n",
+ "1 A A 1 10673082 0.8272 0.8076 0.039 0.968 \n",
+ "2 A A 1 10723065 0.8316 0.8107 0.011 1.577 \n",
+ "3 A A 1 11337555 0.3781 0.7925 0.045 1.104 \n",
+ "4 B B 1 11407894 0.9038 0.8670 0.983 1.122 \n",
+ "\n",
+ " B Allele Freq Log R Ratio \n",
+ "0 1.0000 0.4040 \n",
+ "1 0.0000 0.3017 \n",
+ "2 0.0000 0.0388 \n",
+ "3 0.0000 0.2761 \n",
+ "4 0.9994 0.0022 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_metadata = pd.read_csv('../data/metadata.csv')\n",
+ "df_genstudio = pd.read_csv('../data/genstudio.csv')\n",
+ "print('metadata shape:', df_metadata.shape)\n",
+ "print('genstudio shape:', df_genstudio.shape)\n",
+ "display(df_metadata.head(), df_genstudio.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Every value in `Sample ID` also occurs in `dna_chip_id`, so we will use these columns to link the two tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:45.272328Z",
+ "start_time": "2022-03-17T21:35:45.074296Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "all(df_genstudio['Sample ID'].isin(df_metadata['dna_chip_id']))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create two tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:46.873017Z",
+ "start_time": "2022-03-17T21:35:46.851108Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 841 entries, 0 to 840\n",
+ "Data columns (total 4 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Unnamed: 0 841 non-null int64 \n",
+ " 1 dna_chip_id 841 non-null object\n",
+ " 2 breed 841 non-null object\n",
+ " 3 sex 841 non-null object\n",
+ "dtypes: int64(1), object(3)\n",
+ "memory usage: 26.4+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_metadata.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:48.427679Z",
+ "start_time": "2022-03-17T21:35:48.401110Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 2000000 entries, 0 to 1999999\n",
+ "Data columns (total 20 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 Unnamed: 0 int64 \n",
+ " 1 SNP Name object \n",
+ " 2 SNP Index int64 \n",
+ " 3 SNP Aux int64 \n",
+ " 4 Sample ID object \n",
+ " 5 SNP object \n",
+ " 6 Allele1 - Top object \n",
+ " 7 Allele2 - Top object \n",
+ " 8 Allele1 - Forward object \n",
+ " 9 Allele2 - Forward object \n",
+ " 10 Allele1 - AB object \n",
+ " 11 Allele2 - AB object \n",
+ " 12 Chr object \n",
+ " 13 Position object \n",
+ " 14 GC Score float64\n",
+ " 15 GT Score float64\n",
+ " 16 Theta float64\n",
+ " 17 R float64\n",
+ " 18 B Allele Freq float64\n",
+ " 19 Log R Ratio float64\n",
+ "dtypes: float64(6), int64(3), object(11)\n",
+ "memory usage: 305.2+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_genstudio.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:35:50.283080Z",
+ "start_time": "2022-03-17T21:35:50.260302Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "connection = sqlite3.connect('../data/my_database.db')\n",
+ "create_metadata = '''CREATE TABLE metadata(\n",
+ "    Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,\n",
+ "    dna_chip_id TEXT,\n",
+ "    breed TEXT,\n",
+ "    sex TEXT)\n",
+ "    '''\n",
+ "create_genstudio = '''CREATE TABLE genstudio(\n",
+ "    Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,\n",
+ "    SNP_name TEXT,\n",
+ "    SNP_index INTEGER,\n",
+ "    SNP_Aux INTEGER,\n",
+ "    dna_chip_id TEXT,\n",
+ "    SNP TEXT,\n",
+ "    Allele1_Top TEXT,\n",
+ "    Allele2_Top TEXT,\n",
+ "    Allele1_Forward TEXT,\n",
+ "    Allele2_Forward TEXT,\n",
+ "    Allele1_AB TEXT,\n",
+ "    Allele2_AB TEXT,\n",
+ "    Chr TEXT,\n",
+ "    Position TEXT,\n",
+ "    GC_Score NUMERIC,\n",
+ "    GT_Score NUMERIC,\n",
+ "    Theta NUMERIC,\n",
+ "    R NUMERIC,\n",
+ "    B_Allele_Freq NUMERIC,\n",
+ "    Log_R_Ration NUMERIC,\n",
+ "    FOREIGN KEY (dna_chip_id) REFERENCES metadata(dna_chip_id))\n",
+ "    '''\n",
+ "connection.executescript('DROP TABLE IF EXISTS genstudio; DROP TABLE IF EXISTS metadata;')  # drop stale tables (child first) so the notebook can be re-run\n",
+ "connection.execute(create_metadata)\n",
+ "connection.execute(create_genstudio)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Fill tables with values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:38:04.502589Z",
+ "start_time": "2022-03-17T21:37:58.241842Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "insertion_metadata = '''INSERT INTO metadata(Id,\n",
+ " dna_chip_id,\n",
+ " breed,\n",
+ " sex)\n",
+ " VALUES(?,?,?,?)'''\n",
+ "\n",
+ "insertion_genstudio = '''INSERT INTO genstudio(Id,\n",
+ " SNP_name,\n",
+ " SNP_index,\n",
+ " SNP_Aux,\n",
+ " dna_chip_id,\n",
+ " SNP,\n",
+ " Allele1_Top,\n",
+ " Allele2_Top,\n",
+ " Allele1_Forward, \n",
+ " Allele2_Forward,\n",
+ " Allele1_AB,\n",
+ " Allele2_AB,\n",
+ " Chr,\n",
+ " Position,\n",
+ " GC_Score,\n",
+ " GT_Score,\n",
+ " Theta,\n",
+ " R,\n",
+ " B_Allele_Freq,\n",
+ " Log_R_Ration)\n",
+ " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''\n",
+ "\n",
+ "\n",
+ "connection.executemany(insertion_metadata, df_metadata.values)\n",
+ "connection.executemany(insertion_genstudio, df_genstudio.values)\n",
+ "connection.commit()\n",
+ "connection.close()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Test some commands"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:39:09.063249Z",
+ "start_time": "2022-03-17T21:39:09.060063Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "connection = sqlite3.connect('../data/my_database.db')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:39:09.789925Z",
+ "start_time": "2022-03-17T21:39:09.774523Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1, '202341831114R01C01', '1_10673082', '[T/C]')\n",
+ "(2, '202341831114R01C01', '1_10723065', '[A/G]')\n",
+ "(4, '202341831114R01C01', '1_11407894', '[A/G]')\n",
+ "(5, '202341831114R01C01', '1_11426075', '[T/C]')\n",
+ "(8, '202341831114R01C01', '1_13996200', '[T/C]')\n",
+ "(9, '202341831114R01C01', '1_142535524', '[A/G]')\n",
+ "(10, '202341831114R01C01', '1_14638936', '[T/C]')\n",
+ "(11, '202341831114R01C01', '1_161891709', '[A/G]')\n",
+ "(12, '202341831114R01C01', '1_17346505', '[A/G]')\n",
+ "(13, '202341831114R01C01', '1_17537210', '[T/C]')\n",
+ "(15, '202341831114R01C01', '1_242598', '[A/G]')\n",
+ "(16, '202341831114R01C01', '1_2463520', '[A/G]')\n",
+ "(18, '202341831114R01C01', '1_286337402', '[A/G]')\n",
+ "(19, '202341831114R01C01', '1_294072400', '[T/C]')\n",
+ "(22, '202341831114R01C01', '1_303127440', '[A/G]')\n"
+ ]
+ }
+ ],
+ "source": [
+ "query = '''SELECT Id, dna_chip_id, SNP_name, SNP FROM genstudio\n",
+ " WHERE genstudio.GC_score > 0.5'''\n",
+ "rows = connection.execute(query).fetchmany(15)\n",
+ "for row in rows:\n",
+ " print(row)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-03-17T21:39:55.848388Z",
+ "start_time": "2022-03-17T21:39:53.943812Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_11524303', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_15026457', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_15270724', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_17565103', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_17770477', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_18682426', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_19324750', '[T/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_20043953', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_20166080', '[A/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_2598359', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_27014378', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_27620869', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_28797811', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_28894311', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_29099121', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_29176531', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_29353382', '[A/G]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_34720438', '[A/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_35026092', '[T/C]', '10')\n",
+ "(1, '202341831114R02C01', 'Д', 'Хр', '10_35986760', '[T/C]', '10')\n"
+ ]
+ }
+ ],
+ "source": [
+ "query = '''SELECT metadata.Id, metadata.dna_chip_id,\n",
+ "    metadata.breed, metadata.sex,\n",
+ "    genstudio.SNP_name, genstudio.SNP,\n",
+ "    genstudio.Chr\n",
+ "    FROM metadata\n",
+ "    INNER JOIN genstudio ON metadata.dna_chip_id = genstudio.dna_chip_id\n",
+ "    '''\n",
+ "\n",
+ "for row in connection.execute(query).fetchmany(20):\n",
+ "    print(row)\n",
+ "connection.close()  # last query of the notebook: release the database file"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/hw11/databases.py b/hw11/databases.py
new file mode 100644
index 0000000..06ab075
--- /dev/null
+++ b/hw11/databases.py
@@ -0,0 +1,114 @@
+import pandas as pd
+import sqlite3
+from IPython.display import display
+
+
+# Look at our dataframes
+
+df_metadata = pd.read_csv('../data/metadata.csv')
+df_genstudio = pd.read_csv('../data/genstudio.csv')
+print('metadata shape:', df_metadata.shape)
+print('genstudio shape:', df_genstudio.shape)
+display(df_metadata.head(), df_genstudio.head())
+
+# Column `Sample ID` contains the same values as `dna_chip_id` => we will use them to connect two tables
+
+all(df_genstudio['Sample ID'].isin(df_metadata['dna_chip_id']))
+
+# Create two databases
+
+df_metadata.info()
+df_genstudio.info()
+connection = sqlite3.connect('../data/my_database.db')
+create_metadata = '''CREATE TABLE metadata(
+ Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+ dna_chip_id TEXT,
+ breed TEXT,
+ sex TEXT)
+ '''
+
+create_genstudio = '''CREATE TABLE genstudio(
+ Id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+ SNP_name TEXT,
+ SNP_index INTEGER,
+ SNP_Aux INTEGER,
+ dna_chip_id TEXT,
+ SNP TEXT,
+ Allele1_Top TEXT,
+ Allele2_Top TEXT,
+ Allele1_Forward TEXT,
+ Allele2_Forward TEXT,
+ Allele1_AB TEXT,
+ Allele2_AB TEXT,
+ Chr TEXT,
+ Position TEXT,
+ GC_Score NUMERIC,
+ GT_Score NUMERIC,
+ Theta NUMERIC,
+ R NUMERIC,
+ B_Allele_Freq NUMERIC,
+ Log_R_Ration NUMERIC,
+ FOREIGN KEY (dna_chip_id) REFERENCES metadata(dna_chip_id))
+ '''
+connection.execute(create_metadata)
+connection.execute(create_genstudio)
+
+
+# Fill tables with values
+
+insertion_metadata = '''INSERT INTO metadata(Id,
+ dna_chip_id,
+ breed,
+ sex)
+ VALUES(?,?,?,?)'''
+
+insertion_genstudio = '''INSERT INTO genstudio(Id,
+ SNP_name,
+ SNP_index,
+ SNP_Aux,
+ dna_chip_id,
+ SNP,
+ Allele1_Top,
+ Allele2_Top,
+ Allele1_Forward,
+ Allele2_Forward,
+ Allele1_AB,
+ Allele2_AB,
+ Chr,
+ Position,
+ GC_Score,
+ GT_Score,
+ Theta,
+ R,
+ B_Allele_Freq,
+ Log_R_Ration)
+ VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
+
+
+connection.executemany(insertion_metadata, df_metadata.values)
+connection.executemany(insertion_genstudio, df_genstudio.values)
+connection.commit()
+connection.close()
+
+
+# Test some commands
+
+connection = sqlite3.connect('../data/my_database.db')
+
+query = '''SELECT Id, dna_chip_id, SNP_name, SNP FROM genstudio
+ WHERE genstudio.GC_score > 0.5'''
+rows = connection.execute(query).fetchmany(15)
+for row in rows:
+ print(row)
+
+query = '''SELECT metadata.Id, metadata.dna_chip_id,
+ metadata.breed, metadata.sex,
+ genstudio.SNP_name, genstudio.SNP,
+ genstudio.Chr
+ FROM metadata, genstudio
+ WHERE metadata.dna_chip_id = genstudio.dna_chip_id
+ '''
+
+rows = connection.execute(query).fetchmany(20)
+for row in rows:
+ print(row)
diff --git a/hw11/requirements.txt b/hw11/requirements.txt
new file mode 100644
index 0000000..7e7e669
--- /dev/null
+++ b/hw11/requirements.txt
@@ -0,0 +1,2 @@
+pandas==1.0.1
+ipython==7.12.0