Skip to content

Commit cc4e34b

Browse files
authored
Define supported column names (#116)
1 parent 6f992d9 commit cc4e34b

1 file changed

Lines changed: 42 additions & 26 deletions

File tree

src/PreprocessMD.jl

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,20 @@ using Tables: materializer
1717

1818
export add_label_column!, pivot, subsetMD, top_n_values
1919

20+
# Types accepted wherever a single column name is expected.
# Declared `const` so the alias cannot be rebound and is not an untyped global.
const COLUMN_TYPES = Union{String, Symbol}
21+
# Column name or `nothing`; `nothing` is the default for the optional column arguments.
# Declared `const` so the alias cannot be rebound and is not an untyped global.
const OPTIONAL_COLUMN_TYPES = Union{COLUMN_TYPES, Nothing}
22+
2023
"""
21-
function add_label_column!(to_df, from_df, new_col_name[, id])::Nothing
24+
function add_label_column!(feature_df, source_df, new_column[, id])
2225
2326
Add column to a DataFrame based on symbol presence in the target DataFrame
2427
2528
# Arguments
2629
- `
27-
- `to_df::AbstractDataFrame`: feature DataFrame to which label column is added
28-
- `from_df::AbstractDataFrame`: DataFrame containing the label column
29-
- `new_col_name`: name assigned to label column
30-
- `id=nothing`: row IDs (Default: first column)
30+
- `feature_df::AbstractDataFrame`: feature DataFrame to which label column is added
31+
- `source_df::AbstractDataFrame`: DataFrame containing the label column
32+
- `new_column::Union{String, Symbol}`: name assigned to label column
33+
- `id::Union{Nothing, String, Symbol}`: row IDs (Default: first column)
3134
3235
# Examples
3336
```jldoctest
@@ -56,10 +59,15 @@ X
5659
5760
```
5861
"""
59-
function add_label_column!(to_df::AbstractDataFrame, from_df::AbstractDataFrame, new_col_name, id=nothing)::Nothing
62+
function add_label_column!(
63+
feature_df::AbstractDataFrame,
64+
source_df::AbstractDataFrame,
65+
new_column::COLUMN_TYPES,
66+
id::OPTIONAL_COLUMN_TYPES=nothing,
67+
)::Nothing
6068

6169
# Error checks
62-
for arg in [to_df, from_df]
70+
for arg in [feature_df, source_df]
6371
if size(arg)[1] < 1
6472
#@warn "DataFrame must have at least 1 row"
6573
throw(DomainError(arg))
@@ -72,31 +80,30 @@ function add_label_column!(to_df::AbstractDataFrame, from_df::AbstractDataFrame,
7280

7381
# Assign missing arguments
7482
if isnothing(id)
75-
id = names(to_df)[1]
83+
id = names(feature_df)[1]
7684
end
7785

7886
# Add column
79-
#insertcols!(to_df, new_col_name => [x[id] in from_df[!,id] for x in eachrow(to_df)])
80-
insertcols!(to_df, new_col_name => map(x -> x in from_df[!, id], to_df[!, id]))
87+
#insertcols!(feature_df, new_column => [x[id] in source_df[!,id] for x in eachrow(feature_df)])
88+
insertcols!(feature_df, new_column => map(x -> x in source_df[!, id], feature_df[!, id]))
8189

82-
coerce!(to_df, new_col_name => OrderedFactor{2})
90+
coerce!(feature_df, new_column => OrderedFactor{2})
8391
return nothing
8492
end
85-
function add_label_column!(to_table, from_table, id=nothing, new_col_name=nothing
86-
)::Nothing
87-
assert_is_table(to_table)
88-
assert_is_table(from_table)
93+
function add_label_column!(feature_table::Any, source_table::Any, id::OPTIONAL_COLUMN_TYPES=nothing, new_column::OPTIONAL_COLUMN_TYPES=nothing)::Nothing
94+
assert_is_table(feature_table)
95+
assert_is_table(source_table)
8996

90-
to_df = DataFrame(to_table)::DataFrame
91-
from_df = DataFrame(to_table)::DataFrame
97+
feature_df = DataFrame(feature_table)::DataFrame
98+
# Build the source frame from `source_table`; it was previously built from
# `feature_table`, so the label source was silently ignored.
source_df = DataFrame(source_table)::DataFrame
9299

93-
to_df::DataFrame
94-
from_df::DataFrame
100+
feature_df::DataFrame
101+
source_df::DataFrame
95102

96-
return add_label_column!(to_df, from_df, id, new_col_name)
103+
return add_label_column!(feature_df, source_df, id, new_column)
97104
end
98105

99-
function assert_is_table(x)
106+
function assert_is_table(x::Any)::Nothing
100107
if !istable(x)
101108
msg = "Input must be a table, but $(typeof(x)) is not a table"
102109
throw(ArgumentError(msg))
@@ -131,7 +138,11 @@ pivot(df)
131138
132139
```
133140
"""
134-
function pivot(df::AbstractDataFrame, newcols=nothing, y=nothing)::AbstractDataFrame
141+
function pivot(
142+
df::AbstractDataFrame,
143+
newcols::OPTIONAL_COLUMN_TYPES=nothing,
144+
y::OPTIONAL_COLUMN_TYPES=nothing,
145+
)::AbstractDataFrame
135146

136147
# Error checks
137148
if size(df)[1] < 1
@@ -169,7 +180,7 @@ function pivot(df::AbstractDataFrame, newcols=nothing, y=nothing)::AbstractDataF
169180
end
170181
return B
171182
end
172-
function pivot(obj)
183+
function pivot(obj::Any)::Any
173184
assert_is_table(obj)
174185
df = DataFrame(obj)::DataFrame
175186
df::DataFrame
@@ -238,7 +249,12 @@ subsetMD(X,Y)
238249
239250
```
240251
"""
241-
function subsetMD(main_df::AbstractDataFrame, check_df::AbstractDataFrame, main_id=nothing, check_id=nothing)::AbstractDataFrame
252+
function subsetMD(
253+
main_df::AbstractDataFrame,
254+
check_df::AbstractDataFrame,
255+
main_id::OPTIONAL_COLUMN_TYPES=nothing,
256+
check_id::OPTIONAL_COLUMN_TYPES=nothing,
257+
)::AbstractDataFrame
242258

243259
# Assign missing arguments
244260
if isnothing(main_id)
@@ -257,11 +273,11 @@ end
257273
=#
258274

259275
"""
260-
function top_n_values(df::AbstractDataFrame, col, n::Int)::AbstractDataFrame
276+
function top_n_values(df::AbstractDataFrame, col::Union{String, Symbol}, n::Int)::AbstractDataFrame
261277
Find top n values by occurrence
262278
Useful for initial feasibility checks, but medical codes are not considered
263279
"""
264-
function top_n_values(df::AbstractDataFrame, col, n::Int)::AbstractDataFrame
280+
function top_n_values(df::AbstractDataFrame, col::COLUMN_TYPES, n::Int)::AbstractDataFrame
265281
return first(sort(combine(nrow, groupby(df, col)), "nrow"; rev=true), n)
266282
end
267283

0 commit comments

Comments
 (0)