Skip to content

Commit 990a8bc

Browse files
committed
ENH: Add support for standard sql datatypes
1 parent 0474693 commit 990a8bc

4 files changed

Lines changed: 133 additions & 12 deletions

File tree

dataframe_sql/grammar/sql.grammar

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,22 @@ column_name: [NAME "."] NAME
6363

6464

6565
SELECT_CONSTRAINT.9: "ALL"i | "DISTINCT"i
66-
TYPENAME: "object"i | "int64"i | "float64"i | "bool"i | "datetime64"i | "timedelta[ns]"i | "category"i
66+
TYPENAME: "object"i
67+
| "varchar"i
68+
| "int16"i
69+
| "smallint"i
70+
| "int32"i
71+
| "int64"i
72+
| "int"i
73+
| "bigint"i
74+
| "float16"i
75+
| "float32"i
76+
| "float64"i
77+
| "float"i
78+
| "bool"i
79+
| "datetime64"i
80+
| "timestamp"i
81+
| "category"i
6782
?aggregation: NAME -> aggregation_name
6883
alias: NAME -> alias_string
6984
_window_name: NAME

dataframe_sql/parsing/sql_parser.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,34 @@
4242
)
4343
PANDAS_TYPE_PYTHON_TYPE_FUNCTION = {
4444
"object": str,
45+
"string": str,
46+
"int16": int,
47+
"int32": int,
4548
"int64": int,
49+
"float16": float,
50+
"float32": float,
4651
"float64": float,
4752
"bool": bool,
4853
}
54+
55+
TYPE_TO_PANDAS_TYPE = {
56+
"varchar": "string",
57+
"smallint": "int16",
58+
"int": "int32",
59+
"bigint": "int64",
60+
"float": "float64",
61+
"timestamp": "datetime64",
62+
"datetime64": "datetime64",
63+
"timedelta[ns]": "timedelta[ns]",
64+
"category": "category",
65+
}
66+
67+
for TYPE in PANDAS_TYPE_PYTHON_TYPE_FUNCTION:
68+
TYPE_TO_PANDAS_TYPE[TYPE] = TYPE
69+
4970
PANDAS_TYPE_TO_SQL_TYPE = {
5071
"object": String,
72+
"string": String,
5173
"int64": Number,
5274
"float64": Number,
5375
"bool": Bool,
@@ -748,8 +770,6 @@ def rank(self, tokens, rank_function):
748770
:param rank_function: Function to be used in rank evaluation
749771
:return:
750772
"""
751-
print("yes")
752-
753773
expressions = tokens[0]
754774
series_list = []
755775
order_list = []
@@ -846,7 +866,7 @@ def as_type(self, column_and_type):
846866
"""
847867
column = column_and_type[0]
848868
typename = column_and_type[1]
849-
column.typename = typename.value
869+
column.typename = TYPE_TO_PANDAS_TYPE[typename.value]
850870
return column
851871

852872
def literal_cast(self, value_and_type: list):
@@ -1218,7 +1238,10 @@ def handle_non_token_non_tree(query_info: QueryInfo, token, token_pos):
12181238
if isinstance(token, Column):
12191239
query_info.columns.append(token)
12201240
query_info.column_selected[token.name] = True
1241+
# TODO Get rid of collecting this alias information since it's part of the
1242+
# column object
12211243
if token.alias:
1244+
print(query_info.aliases)
12221245
query_info.aliases[token.name] = token.alias
12231246

12241247
if isinstance(token, Expression):
@@ -1398,6 +1421,7 @@ def handle_columns(
13981421
:param internal_transformer: Transformer to transform the where clauses
13991422
:return:
14001423
"""
1424+
print(columns)
14011425
where_value = None
14021426
where_plan = ":"
14031427
if where_expr is not None:
@@ -1415,6 +1439,7 @@ def handle_columns(
14151439
new_frame = first_frame.copy()
14161440
else:
14171441
column_names = []
1442+
final_names = []
14181443
for column in columns:
14191444
true_column_name = self.column_name_map[column.table][
14201445
column.name.lower()
@@ -1426,13 +1451,20 @@ def handle_columns(
14261451
):
14271452
aliases[true_column_name] = column.name
14281453

1454+
if column.alias:
1455+
final_names.append(column.alias)
1456+
else:
1457+
final_names.append(column.name)
1458+
1459+
print(final_names)
14291460
if where_value is not None:
14301461
new_frame = first_frame.loc[where_value, column_names]
14311462
else:
14321463
new_frame = first_frame.loc[:, column_names]
14331464
execution_plan += f".loc[{where_plan}, {column_names}]"
14341465
if aliases:
1435-
new_frame = new_frame.rename(columns=aliases)
1466+
# new_frame = new_frame.rename(columns=aliases)
1467+
new_frame.columns = final_names
14361468
execution_plan += f".rename(columns={aliases})"
14371469

14381470
return new_frame, execution_plan

dataframe_sql/tests/pandas_sql_functionality_test.py

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from dataframe_sql.sql_objects import AmbiguousColumn
2222
from dataframe_sql.sql_select_query import TableInfo
2323
from dataframe_sql.tests.utils import (
24+
AVOCADO,
2425
DIGIMON_MON_LIST,
2526
DIGIMON_MOVE_LIST,
2627
FOREST_FIRES,
@@ -1320,9 +1321,82 @@ def test_case_statement_with_same_conditions():
13201321
tm.assert_frame_equal(pandas_frame, my_frame)
13211322

13221323

1324+
@assert_state_not_change
1325+
def test_multiple_aliases_same_column():
1326+
"""
1327+
Test multiple aliases on the same column
1328+
:return:
1329+
"""
1330+
my_frame = query(
1331+
"""
1332+
select wind as my_wind, wind as also_the_wind, wind as yes_wind
1333+
from
1334+
forest_fires
1335+
"""
1336+
)
1337+
1338+
pandas_frame = FOREST_FIRES[["wind"]].copy()
1339+
pandas_frame.loc[:, "my_wind"] = FOREST_FIRES["wind"].copy()
1340+
pandas_frame.loc[:, "also_the_wind"] = FOREST_FIRES["wind"]
1341+
pandas_frame.loc[:, "yes_wind"] = FOREST_FIRES["wind"]
1342+
pandas_frame = pandas_frame.drop(columns=["wind"])
1343+
tm.assert_frame_equal(pandas_frame, my_frame)
1344+
1345+
1346+
@assert_state_not_change
1347+
def test_sql_data_types():
1348+
"""
1349+
Tests sql data types
1350+
:return:
1351+
"""
1352+
my_frame = query(
1353+
"""
1354+
select
1355+
cast(avocado_id as object) as avocado_id_object,
1356+
cast(avocado_id as int16) as avocado_id_int16,
1357+
cast(avocado_id as smallint) as avocado_id_smallint,
1358+
cast(avocado_id as int32) as avocado_id_int32,
1359+
cast(avocado_id as int) as avocado_id_int,
1360+
cast(avocado_id as int64) as avocado_id_int64,
1361+
cast(avocado_id as bigint) as avocado_id_bigint,
1362+
cast(avocado_id as float) as avocado_id_float,
1363+
cast(avocado_id as float16) as avocado_id_float16,
1364+
cast(avocado_id as float32) as avocado_id_float32,
1365+
cast(avocado_id as float64) as avocado_id_float64,
1366+
cast(avocado_id as bool) as avocado_id_bool,
1367+
cast(avocado_id as category) as avocado_id_category,
1368+
cast(date as datetime64) as date,
1369+
cast(date as timestamp) as time,
1370+
cast(region as varchar) as region_varchar
1371+
from avocado
1372+
"""
1373+
)
1374+
1375+
pandas_frame = AVOCADO.copy()[["avocado_id", "Date", "region"]]
1376+
pandas_frame["avocado_id_object"] = pandas_frame["avocado_id"].astype("object")
1377+
pandas_frame["avocado_id_int16"] = pandas_frame["avocado_id"].astype("int16")
1378+
pandas_frame["avocado_id_smallint"] = pandas_frame["avocado_id"].astype("int16")
1379+
pandas_frame["avocado_id_int32"] = pandas_frame["avocado_id"].astype("int32")
1380+
pandas_frame["avocado_id_int"] = pandas_frame["avocado_id"].astype("int32")
1381+
pandas_frame["avocado_id_int64"] = pandas_frame["avocado_id"].astype("int64")
1382+
pandas_frame["avocado_id_bigint"] = pandas_frame["avocado_id"].astype("int64")
1383+
pandas_frame["avocado_id_float"] = pandas_frame["avocado_id"].astype("float")
1384+
pandas_frame["avocado_id_float16"] = pandas_frame["avocado_id"].astype("float16")
1385+
pandas_frame["avocado_id_float32"] = pandas_frame["avocado_id"].astype("float32")
1386+
pandas_frame["avocado_id_float64"] = pandas_frame["avocado_id"].astype("float64")
1387+
pandas_frame["avocado_id_bool"] = pandas_frame["avocado_id"].astype("bool")
1388+
pandas_frame["avocado_id_category"] = pandas_frame["avocado_id"].astype("category")
1389+
pandas_frame["date"] = pandas_frame["Date"].astype("datetime64")
1390+
pandas_frame["time"] = pandas_frame["Date"].astype("datetime64")
1391+
pandas_frame["region_varchar"] = pandas_frame["region"].astype("string")
1392+
pandas_frame = pandas_frame.drop(columns=["avocado_id", "Date", "region"])
1393+
1394+
tm.assert_frame_equal(pandas_frame, my_frame)
1395+
1396+
13231397
if __name__ == "__main__":
13241398
register_env_tables()
13251399

1326-
test_in_operator()
1400+
test_sql_data_types()
13271401

13281402
remove_env_tables()

dataframe_sql/tests/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
"""
22
Shared functions among the tests like setting up test environment
33
"""
4-
import os
54
from pathlib import Path
65

76
from pandas import DataFrame, read_csv
87

98
from dataframe_sql import register_temp_table, remove_temp_table
109

11-
DATA_PATH = os.path.join(Path(__file__).parent.parent, "data")
10+
DATA_PATH = Path(__file__).parent.parent / "data"
1211

1312

1413
# Import the data for testing
15-
FOREST_FIRES = read_csv(os.path.join(DATA_PATH, "forestfires.csv"))
16-
DIGIMON_MON_LIST = read_csv(os.path.join(DATA_PATH, "DigiDB_digimonlist.csv"))
17-
DIGIMON_MOVE_LIST = read_csv(os.path.join(DATA_PATH, "DigiDB_movelist.csv"))
18-
DIGIMON_SUPPORT_LIST = read_csv(os.path.join(DATA_PATH, "DigiDB_supportlist.csv"))
14+
FOREST_FIRES = read_csv(DATA_PATH / "forestfires.csv")
15+
DIGIMON_MON_LIST = read_csv(DATA_PATH / "DigiDB_digimonlist.csv")
16+
DIGIMON_MOVE_LIST = read_csv(DATA_PATH / "DigiDB_movelist.csv")
17+
DIGIMON_SUPPORT_LIST = read_csv(DATA_PATH / "DigiDB_supportlist.csv")
18+
AVOCADO = read_csv(DATA_PATH / "avocado.csv")
1919

2020
# Name change is for name interference
2121
DIGIMON_MON_LIST["mon_attribute"] = DIGIMON_MON_LIST["Attribute"]

0 commit comments

Comments
 (0)