Skip to content

Commit 0ec7201

Browse files
drammock authored and bambooforest committed
MRG: Autobuild features (#198)
* WIP: auto-build features * WIP: autobuild working * ENH: add SegmentClass * FIX: diphthong conventions; remove blank lines * minor code cleanup * WIP: handle contextual diacritics * regen data * add new data file (w/ features) * nɖʐ -> ɳɖʐ * ɲd̠ʒ -> n̠d̠ʒ * finish feature builder * regen data * re-gen data after rebase (oops) * ᵊ -> ə * feats for unreleased diacritic * fix: feat assignment for centralized / midcentralized diacritics * cast InventoryID as integer instead of double * doc: clean up comments * change output varnames and filenames * bugfixes, comment cleanup * regen everything * rename LanguageCode -> ISO6393 and keep in output * feature assignment bugfixes * regen data
1 parent 8a09551 commit 0ec7201

18 files changed

Lines changed: 106596 additions & 106094 deletions

aggregate-raw-data.R

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@ debug <- FALSE
1515
data_dir <- file.path("raw-data")
1616
output_dir <- file.path("data")
1717
glotto_path <- file.path("mappings", "InventoryID-LanguageCodes.csv")
18-
output_path <- file.path(output_dir, "phoible-by-phoneme.csv")
19-
output_path_rdata <- file.path(output_dir, "phoible-by-phoneme.RData")
18+
output_path <- file.path(output_dir, "phoible-nofeats.csv")
19+
output_path_rdata <- file.path(output_dir, "phoible-nofeats.RData")
2020
if (!dir.exists(output_dir)) dir.create(output_dir, mode="0755")
2121

2222
## LOAD EXTERNAL FUNCTIONS
@@ -232,30 +232,30 @@ if (!debug) rm(saphon_raw, saphon_ipa)
232232
## ## ## ## ## ## ## ## ##
233233

234234
## combine into one data frame
235-
data_sources_list <- list(ph_data, aa_data, spa_data, upsid_data,
236-
ra_data, gm_data, saphon_data, uz_data, ea_data, er_data)
235+
data_sources_list <- list(ph_data, aa_data, spa_data, upsid_data, ra_data,
236+
gm_data, saphon_data, uz_data, ea_data, er_data)
237237
all_data <- do.call(rbind, data_sources_list)
238-
all_data <- all_data[with(all_data, order(LanguageCode, Source, InventoryID)),]
238+
all_data <- all_data[order(all_data$InventoryID),]
239239

240240
## MERGE IN GLOTTOLOG CODES
241241
glotto_mapping <- read.csv(glotto_path)
242-
glotto_mapping <- glotto_mapping[c("InventoryID", "Glottocode")]
242+
glotto_mapping <- glotto_mapping[c("InventoryID", "Glottocode", "ISO6393")]
243243
all_data <- merge(all_data, glotto_mapping, all.x=TRUE)
244244

245245
## ADD GLYPH IDs
246246
all_data$GlyphID <- get_codepoints(all_data$Phoneme)
247247

248248
## CONVERT INVENTORY ID TO INTEGER
249-
all_data$InventoryID <- as.numeric(all_data$InventoryID)
249+
all_data$InventoryID <- as.integer(all_data$InventoryID)
250250

251251
## SAVE
252-
output_fields <- c("InventoryID", "Glottocode", "LanguageCode", "LanguageName",
252+
output_fields <- c("InventoryID", "Glottocode", "ISO6393", "LanguageName",
253253
"SpecificDialect", "GlyphID", "Phoneme", "Allophones",
254-
"Marginal", "Source")
255-
phoible <- all_data[output_fields]
256-
write.csv(phoible, file=output_path, row.names=FALSE, quote=TRUE, eol="\n",
257-
fileEncoding="UTF-8")
258-
save(phoible, file=output_path_rdata)
254+
"Marginal", "SegmentClass", "Source")
255+
phoible_nofeats <- all_data[output_fields]
256+
write.csv(phoible_nofeats, file=output_path, row.names=FALSE, quote=TRUE,
257+
eol="\n", fileEncoding="UTF-8")
258+
save(phoible_nofeats, file=output_path_rdata)
259259
## WRITE LOG FILE
260260
if(exists("unfamiliar_glyphs")) {
261261
log_path <- file.path(output_dir, "unfamiliar-glyphs.csv")

data/phoible-by-phoneme.RData

-465 KB
Binary file not shown.

data/phoible-by-phoneme.csv

Lines changed: 0 additions & 105482 deletions
This file was deleted.

data/phoible.RData

1.44 MB
Binary file not shown.

data/phoible.csv

Lines changed: 105482 additions & 0 deletions
Large diffs are not rendered by default.

mappings/InventoryID-LanguageCodes.csv

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
"InventoryID","LanguageCode","Glottocode","LanguageName","Source"
1+
"InventoryID","ISO6393","Glottocode","LanguageName","Source"
22
1,"kor","kore1280","Korean","spa"
33
2,"ket","kett1243","Ket","spa"
44
3,"lbe","lakk1252","Lak","spa"

raw-data/EA/EA_IPA_correspondences.tsv

Lines changed: 62 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -82,8 +82,8 @@ aɪ aɪ NA TRUE
8282
aɪ̃ aɪ̃ NA FALSE
8383
a̰ɪ̰ a̰ɪ̰ NA FALSE
8484
ãɪ̯ː ãɪ̯ː NA FALSE
85-
aj aj NA TRUE
86-
ãj ãj NA FALSE
85+
aj ai̯ conventionalized TRUE
86+
ãj ãi̯ conventionalized FALSE
8787
ao ao NA TRUE
8888
ãõ ãõ NA FALSE
8989
aɵ̯ aɵ̯ NA FALSE
@@ -97,9 +97,9 @@ a̰ɯ̰ a̰ɯ̰ NA FALSE
9797
aɯ guess TRUE
9898
aʊ NA TRUE
9999
aʊ̃ aʊ̃ NA FALSE
100-
aw aw NA TRUE
101-
ãw ãw NA FALSE
102-
ãw̃ ãw̃ NA FALSE
100+
aw au̯ conventionalized TRUE
101+
ãw ãu̯ conventionalized FALSE
102+
ãw̃ ãũ̯ conventionalized FALSE
103103
aˤ NA TRUE
104104
aˤː aːˤ NA TRUE
105105
ɐ ɐ NA TRUE
@@ -280,8 +280,8 @@ eɪ eɪ NA TRUE
280280
eɪ̃ eɪ̃ NA FALSE
281281
ẽɪ̃ ẽɪ̃ NA FALSE
282282
ḛɪ̰ ḛɪ̰ NA FALSE
283-
ej ej NA TRUE
284-
ẽj ẽj NA FALSE
283+
ej ei̯ conventionalized TRUE
284+
ẽj ẽi̯ conventionalized FALSE
285285
eo eo NA TRUE
286286
eu eu NA TRUE
287287
eu̯ eu̯ NA TRUE
@@ -305,7 +305,7 @@ ew ue guess TRUE
305305
əi̯ əi̯ NA FALSE
306306
ə̃ĩ ə̃ĩ NA FALSE
307307
əɨ əɨ NA FALSE
308-
əj əj NA FALSE
308+
əj əi̯ conventionalized FALSE
309309
əo əo NA FALSE
310310
əu əu NA TRUE
311311
əũ əũ NA FALSE
@@ -314,7 +314,7 @@ ew ue guess TRUE
314314
əɯ̃ əɯ̃ NA FALSE
315315
ə̰ɯ̰ ə̰ɯ̰ NA FALSE
316316
əʊ əʊ NA FALSE
317-
əw əw NA FALSE
317+
əw əu̯ conventionalized FALSE
318318
əy əy NA TRUE
319319
əˤ əˤ new segment type added FALSE
320320
ɛ ɛ NA TRUE
@@ -358,7 +358,7 @@ ew ue guess TRUE
358358
ɜː ɜː NA TRUE
359359
ɜ̹ː ɜ̹ː NA FALSE
360360
ɜi ɜi NA TRUE
361-
ɜj ɜj NA FALSE
361+
ɜj ɜi̯ conventionalized FALSE
362362
ɞ ɞ NA TRUE
363363
ɞ̜ ɞ̜ NA FALSE
364364
ɞ̠ ɞ̠ NA FALSE
@@ -525,7 +525,7 @@ iɜ iɜ NA FALSE
525525
ii ii NA FALSE
526526
ii̯ ii̯ NA FALSE
527527
iɪ̯ iɪ̯ NA FALSE
528-
ij ij NA FALSE
528+
ij ii̯ conventionalized FALSE
529529
io io NA TRUE
530530
iø NA FALSE
531531
ĩõ ĩõ NA FALSE
@@ -577,33 +577,33 @@ iˤæ iˤæ NA FALSE
577577
ɨɛ ɨɛ NA FALSE
578578
ɨi ɨi NA TRUE
579579
ɨi̯ ɨi̯ NA FALSE
580-
ɨj ɨj NA FALSE
580+
ɨj ɨi̯ conventionalized FALSE
581581
j j NA TRUE
582582
j̊ NA FALSE
583583
j̃ NA TRUE
584584
jː NA TRUE
585-
ja ja NA TRUE
586-
jaː jaː NA FALSE
587-
jæ NA FALSE
588-
jæi jæi NA FALSE
589-
jai jai NA FALSE
590-
jau jau NA FALSE
591-
jau̯ jau̯ NA TRUE
592-
je je NA TRUE
593-
jẽ jẽ NA FALSE
594-
jei̯ jei̯ NA FALSE
595-
jɛ NA TRUE
596-
jɛ̃ jɛ̃ NA FALSE
597-
jɛi jɛi NA FALSE
598-
jɛi̯ jɛi̯ NA FALSE
599-
jo jo NA TRUE
600-
joː joː NA FALSE
601-
joi joi NA FALSE
602-
jɔ NA TRUE
603-
ju ju NA TRUE
604-
juː juː NA FALSE
605-
juaː juaː NA FALSE
606-
jui jui NA FALSE
585+
ja i̯a conventionalized TRUE
586+
jaː i̯aː conventionalized FALSE
587+
i̯æ conventionalized FALSE
588+
jæi i̯æi conventionalized FALSE
589+
jai i̯ai conventionalized FALSE
590+
jau i̯au conventionalized FALSE
591+
jau̯ i̯au̯ conventionalized TRUE
592+
je i̯e conventionalized TRUE
593+
jẽ i̯ẽ conventionalized FALSE
594+
jei̯ i̯ei̯ conventionalized FALSE
595+
i̯ɛ conventionalized TRUE
596+
jɛ̃ i̯ɛ̃ conventionalized FALSE
597+
jɛi i̯ɛi conventionalized FALSE
598+
jɛi̯ i̯ɛi̯ conventionalized FALSE
599+
jo i̯o conventionalized TRUE
600+
joː i̯oː conventionalized FALSE
601+
joi i̯oi conventionalized FALSE
602+
i̯ɔ conventionalized TRUE
603+
ju i̯u conventionalized TRUE
604+
juː i̯uː conventionalized FALSE
605+
juaː i̯uaː conventionalized FALSE
606+
jui i̯ui conventionalized FALSE
607607
ʝ ʝ NA TRUE
608608
ʝː ʝː NA FALSE
609609
ɟ ɟ NA TRUE
@@ -833,23 +833,23 @@ oe oe NA TRUE
833833
œ̃ɛ̃ œ̃ɛ̃ NA FALSE
834834
œ̞ɛ̞ œ̞ɛ̞ new segment type added; TODO: investigate FALSE
835835
œi œi NA TRUE
836-
œj œj NA FALSE
836+
œj œi̯ conventionalized FALSE
837837
œy œy NA TRUE
838838
oə NA FALSE
839839
øə øə NA FALSE
840840
oi oi NA TRUE
841841
oi̯ oi̯ NA TRUE
842842
o̞i̯ o̞i̯ NA FALSE
843843
øɪ̯ øɪ̯ NA FALSE
844-
oj oj NA TRUE
845-
õj õj NA FALSE
844+
oj oi̯ conventionalized TRUE
845+
õj õi̯ conventionalized FALSE
846846
oɔ NA FALSE
847847
ou ou NA TRUE
848848
ou̯ ou̯ NA TRUE
849849
øu øu NA FALSE
850850
oʊ NA TRUE
851851
oʊ̃ oʊ̃ NA FALSE
852-
ow ow NA TRUE
852+
ow ou̯ conventionalized TRUE
853853
øy øy NA TRUE
854854
øy̯ øy̯ NA FALSE
855855
oˤ NA TRUE
@@ -889,7 +889,7 @@ oˤː oːˤ NA FALSE
889889
ɔʊ ɔʊ NA FALSE
890890
ɔʊ̯ ɔʊ̯ NA FALSE
891891
ɔ̃ʊ̯̃ ɔ̃ʊ̯̃ NA FALSE
892-
ɔw ɔw NA FALSE
892+
ɔw ɔu̯ conventionalized FALSE
893893
ɔy̯ː ɔy̯ː NA FALSE
894894
ɔʏ ɔʏ NA FALSE
895895
ɔˤː ɔːˤ NA FALSE
@@ -1238,7 +1238,7 @@ uɜ uɜ NA FALSE
12381238
ui ui NA TRUE
12391239
ui̯ ui̯ NA TRUE
12401240
ũĩ ũĩ NA TRUE
1241-
uj uj NA TRUE
1241+
uj ui̯ conventionalized TRUE
12421242
uo uo NA TRUE
12431243
uo̞ uo̞ NA FALSE
12441244
ũo ũo NA FALSE
@@ -1333,35 +1333,35 @@ vˤ vˤ NA FALSE
13331333
w w NA TRUE
13341334
w̜ NA FALSE
13351335
w̥ NA TRUE
1336-
wa wa NA TRUE
1337-
wã wã NA FALSE
1338-
waː waː NA FALSE
1339-
wæi wæi NA FALSE
1340-
wai wai NA FALSE
1336+
wa u̯a conventionalized TRUE
1337+
wã u̯ã conventionalized FALSE
1338+
waː u̯aː conventionalized FALSE
1339+
wæi u̯æi conventionalized FALSE
1340+
wai u̯ai conventionalized FALSE
13411341
ʷd ʷd NA FALSE
1342-
we we NA TRUE
1343-
wei̯ wei̯ NA TRUE
1344-
weu̯ weu̯ NA FALSE
1345-
wə NA TRUE
1346-
wə̃ wə̃ NA FALSE
1347-
NA FALSE
1348-
wɛ̃ wɛ̃ NA FALSE
1349-
wɛi wɛi NA FALSE
1350-
wɛi̯ wɛi̯ NA FALSE
1342+
we u̯e conventionalized TRUE
1343+
wei̯ u̯ei̯ conventionalized TRUE
1344+
weu̯ u̯eu̯ conventionalized FALSE
1345+
u̯ə conventionalized TRUE
1346+
wə̃ u̯ə̃ conventionalized FALSE
1347+
u̯ɛ NA FALSE
1348+
wɛ̃ u̯ɛ̃ conventionalized FALSE
1349+
wɛi u̯ɛi conventionalized FALSE
1350+
wɛi̯ u̯ɛi̯ conventionalized FALSE
13511351
wʰ NA FALSE
1352-
wi wi NA TRUE
1352+
wi u̯i NA TRUE
13531353
wʲ NA TRUE
13541354
w̜ʲ w̜ʲ NA FALSE
13551355
ʷɟ ʷɟ NA FALSE
13561356
ʷl ʷl NA FALSE
13571357
ʷm ʷm NA FALSE
1358-
wo wo NA TRUE
1359-
wõ wõ NA FALSE
1360-
woː woː NA FALSE
1361-
wõː wõː NA FALSE
1362-
wɔ NA FALSE
1363-
wɔ̃ wɔ̃ NA FALSE
1364-
wɔː wɔː NA FALSE
1358+
wo u̯o conventionalized TRUE
1359+
wõ u̯õ conventionalized FALSE
1360+
woː u̯oː conventionalized FALSE
1361+
wõː u̯õː conventionalized FALSE
1362+
u̯ɔ conventionalized FALSE
1363+
wɔ̃ u̯ɔ̃ conventionalized FALSE
1364+
wɔː u̯ɔː conventionalized FALSE
13651365
ʷr ʷr NA FALSE
13661366
ʷz ʷz NA FALSE
13671367
ʷʐ ʷʐ NA FALSE

0 commit comments

Comments
 (0)