Skip to content

Commit 04358b6

Browse files
committed
Merge pull request #5 from balhoff/categorical-data-enhancements
Categorical data enhancements
2 parents 0c4fc39 + 3cdd934 commit 04358b6

14 files changed

Lines changed: 5999 additions & 164 deletions

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.DS_Store
2+
.classpath
3+
.project
4+
.settings
5+
bin

src/mesquite/nexml/InterpretNEXML/NexmlReaders/NexmlCharactersBlockReader.java

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@
2323
import org.nexml.model.Annotatable;
2424
import org.nexml.model.CategoricalMatrix;
2525
import org.nexml.model.Character;
26+
import org.nexml.model.CompoundCharacterState;
2627
import org.nexml.model.ContinuousMatrix;
2728
import org.nexml.model.Matrix;
2829
import org.nexml.model.MatrixCell;
2930
import org.nexml.model.MolecularMatrix;
3031
import org.nexml.model.OTU;
3132
import org.nexml.model.OTUs;
33+
import org.nexml.model.UncertainCharacterState;
3234

3335
/**
3436
* @author rvosa
@@ -70,7 +72,7 @@ else if ( xmlMatrix instanceof MolecularMatrix ) {
7072
return null;
7173
}
7274
}
73-
75+
7476
/**
7577
*
7678
* @param mesDataType
@@ -90,21 +92,48 @@ private FileElement readMatrix(String mesDataType,Matrix<?> xmlMatrix,MesquiteFi
9092
for ( Character xmlCharacter : xmlCharacterList ) {
9193
CharacterState mesCS = null;
9294
MatrixCell<?> xmlCell = xmlMatrix.getCell(xmlOTU, xmlCharacter);
93-
if ( mesMatrix instanceof ContinuousData ) {
94-
Double xmlDouble = (Double)xmlCell.getValue();
95-
if ( xmlDouble != null ) {
96-
mesCS = new ContinuousState(xmlDouble);
97-
((ContinuousState)mesCS).setNumItems(1); // XXX for multidimensional matrices
98-
}
99-
}
100-
else {
101-
org.nexml.model.CharacterState xmlState = (org.nexml.model.CharacterState)xmlCell.getValue();
102-
if ( xmlState != null ) {
103-
mesCS = new CategoricalState();
104-
String xmlSymbol = xmlState.getSymbol().toString();
105-
mesCS.setValue(xmlSymbol, mesMatrix);
106-
}
107-
}
95+
if ( mesMatrix instanceof ContinuousData ) {
96+
Double xmlDouble = (Double)xmlCell.getValue();
97+
if ( xmlDouble != null ) {
98+
mesCS = new ContinuousState(xmlDouble);
99+
((ContinuousState)mesCS).setNumItems(1); // XXX for multidimensional matrices
100+
}
101+
}
102+
else {
103+
if (xmlMatrix instanceof CategoricalMatrix) {
104+
for (org.nexml.model.CharacterState state : xmlCharacter.getCharacterStateSet().getCharacterStates()) {
105+
if (!(state instanceof CompoundCharacterState)) {
106+
String label = state.getLabel();
107+
if ((null != label) && (!label.equals("")) && (mesMatrix instanceof CategoricalData)) {
108+
int stateIndex = Integer.parseInt(state.getSymbol().toString());
109+
((CategoricalData)mesMatrix).setStateName(mesCharacter, stateIndex, label);
110+
}
111+
}
112+
}
113+
}
114+
org.nexml.model.CharacterState xmlState = (org.nexml.model.CharacterState)xmlCell.getValue();
115+
if ( xmlState != null ) {
116+
String xmlSymbol = xmlState.getSymbol().toString();
117+
if (xmlMatrix instanceof CategoricalMatrix) {
118+
long stateValue = CategoricalState.emptySet();
119+
if (xmlState instanceof CompoundCharacterState) {
120+
for (org.nexml.model.CharacterState state : ((CompoundCharacterState)xmlState).getStates()) {
121+
int memberSymbol = Integer.parseInt(state.getSymbol().toString());
122+
stateValue = CategoricalState.addToSet(stateValue, memberSymbol);
123+
}
124+
if (xmlState instanceof UncertainCharacterState) {
125+
stateValue = CategoricalState.setUncertainty(stateValue, true);
126+
}
127+
} else {
128+
stateValue = CategoricalState.makeSet(Integer.parseInt(xmlSymbol));
129+
}
130+
mesCS = new CategoricalState(stateValue);
131+
} else {
132+
mesCS = new CategoricalState();
133+
mesCS.setValue(xmlSymbol, mesMatrix);
134+
}
135+
}
136+
}
108137
if ( mesCS != null ) {
109138
mesMatrix.setState(mesCharacter, mesTaxon, mesCS);
110139
//can add in character state stuff here

src/mesquite/nexml/InterpretNEXML/NexmlWriters/NexmlCharactersBlockWriter.java

Lines changed: 122 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22

33
import java.util.ArrayList;
44
import java.util.HashMap;
5+
import java.util.HashSet;
56
import java.util.List;
67
import java.util.Map;
8+
import java.util.Set;
79

810
import mesquite.categ.lib.CategoricalData;
11+
import mesquite.categ.lib.CategoricalState;
912
import mesquite.categ.lib.DNAData;
1013
import mesquite.categ.lib.ProteinData;
1114
import mesquite.categ.lib.RNAData;
@@ -22,23 +25,31 @@
2225
import org.nexml.model.CategoricalMatrix;
2326
import org.nexml.model.Character;
2427
import org.nexml.model.CharacterStateSet;
28+
import org.nexml.model.CompoundCharacterState;
2529
import org.nexml.model.Document;
2630
import org.nexml.model.Matrix;
2731
import org.nexml.model.MatrixCell;
2832
import org.nexml.model.MolecularMatrix;
2933
import org.nexml.model.NexmlWritable;
3034
import org.nexml.model.OTU;
3135
import org.nexml.model.OTUs;
36+
import org.nexml.model.PolymorphicCharacterState;
37+
import org.nexml.model.UncertainCharacterState;
3238

3339
public class NexmlCharactersBlockWriter extends NexmlBlockWriter {
34-
40+
41+
/**
42+
* Generate symbols for uncertainties and polymorphisms that don't conflict with existing state symbols.
43+
*/
44+
private int nextMultipleStateSymbol = CategoricalState.getMaxPossibleStateStatic() + 1;
45+
3546
@SuppressWarnings("serial")
3647
private static final Map<String , String> xmlMolecularDataTypeFor = new HashMap<String, String>() {{
3748
put(DNAData.DATATYPENAME, MolecularMatrix.DNA);
3849
put(RNAData.DATATYPENAME, MolecularMatrix.RNA);
3950
put(ProteinData.DATATYPENAME, MolecularMatrix.Protein);
4051
}};
41-
52+
4253
/**
4354
*
4455
* @param employerEmployee
@@ -57,66 +68,101 @@ protected Annotatable writeBlock(Document xmlProject, FileElement mesBlock) {
5768
Taxa mesTaxa = mesData.getTaxa();
5869
OTUs xmlTaxa = findEquivalentTaxa(mesTaxa,xmlProject);
5970
org.nexml.model.Matrix<?> xmlMatrix = null;
60-
CharacterStateSet xmlCharacterStateSet = null;
6171
String mesDataType = mesData.getDataTypeName();
6272
if ( xmlMolecularDataTypeFor.containsKey(mesDataType) ) {
6373
xmlMatrix = xmlProject.createMolecularMatrix(xmlTaxa,xmlMolecularDataTypeFor.get(mesDataType));
64-
xmlCharacterStateSet = ((MolecularMatrix)xmlMatrix).getCharacterStateSet();
6574
}
6675
else if ( mesDataType.equalsIgnoreCase(CategoricalData.DATATYPENAME) ) {
6776
xmlMatrix = xmlProject.createCategoricalMatrix(xmlTaxa);
68-
xmlCharacterStateSet = ((CategoricalMatrix)xmlMatrix).createCharacterStateSet();
6977
}
7078
else if ( mesDataType.equalsIgnoreCase(ContinuousData.DATATYPENAME) ) {
7179
xmlMatrix = xmlProject.createContinuousMatrix(xmlTaxa);
7280
}
7381
else {
7482
MesquiteMessage.warnProgrammer("Can't write data type "+mesDataType);
75-
}
76-
writeCharacterStates(mesData, xmlMatrix, xmlCharacterStateSet);
83+
}
84+
writeCharacterStates(mesData, xmlMatrix);
7785
return xmlMatrix;
7886
}
79-
87+
8088
/**
8189
*
8290
* @param mesData
8391
* @param xmlMatrix
8492
* @param xmlCharacterStateSet
8593
*/
8694
@SuppressWarnings("unchecked")
87-
private void writeCharacterStates(CharacterData mesData, org.nexml.model.Matrix<?> xmlMatrix, CharacterStateSet xmlCharacterStateSet) {
95+
private void writeCharacterStates(CharacterData mesData, org.nexml.model.Matrix<?> xmlMatrix) {
8896
String mesDataType = mesData.getDataTypeName();
8997
int mesNchar = mesData.getNumChars();
9098
List<Character> xmlCharacters = new ArrayList<Character>(mesNchar);
91-
for ( int j = 0; j < mesNchar; j++ ) {
99+
for ( int characterIndex = 0; characterIndex < mesNchar; characterIndex++ ) {
100+
CharacterStateSet xmlCharacterStateSet = null;
101+
if ( xmlMolecularDataTypeFor.containsKey(mesDataType) ) {
102+
xmlCharacterStateSet = ((MolecularMatrix)xmlMatrix).getCharacterStateSet();
103+
}
104+
else if ( mesDataType.equalsIgnoreCase(CategoricalData.DATATYPENAME) ) {
105+
xmlCharacterStateSet = ((CategoricalMatrix)xmlMatrix).createCharacterStateSet();
106+
}
92107
Character xmlChar = xmlMatrix.createCharacter(xmlCharacterStateSet);
93-
String mesCharacterName = mesData.getCharacterName(j);
108+
String mesCharacterName = mesData.getCharacterName(characterIndex);
94109
if ( null != mesCharacterName && ! mesCharacterName.equals("") ) {
95110
xmlChar.setLabel(mesCharacterName);
96111
}
112+
if ( mesDataType.equalsIgnoreCase(CategoricalData.DATATYPENAME) ) {
113+
CategoricalData data = ((CategoricalData)mesData);
114+
int maxStateIndex = data.maxStateWithName(characterIndex);
115+
for (int stateIndex = 0; stateIndex <= maxStateIndex; stateIndex++) {
116+
String symbol = String.valueOf(data.getSymbol(stateIndex));
117+
org.nexml.model.CharacterState state = xmlChar.getCharacterStateSet().createCharacterState(symbol);
118+
state.setSymbol(symbol);
119+
if (data.hasStateName(characterIndex, stateIndex)) {
120+
String stateLabel = data.getStateName(characterIndex, stateIndex);
121+
state.setLabel(stateLabel);
122+
}
123+
}
124+
}
97125
xmlCharacters.add(xmlChar);
98126
}
99-
for ( int j = 0; j < mesData.getNumTaxa(); j++ ) {
100-
CharacterState[] mesChars = mesData.getCharacterStateArray(j, 0, mesNchar);
101-
Taxon mesTaxon = mesData.getTaxa().getTaxon(j);
127+
for (int taxonIndex = 0; taxonIndex < mesData.getNumTaxa(); taxonIndex++) {
128+
CharacterState[] mesCharStates = mesData.getCharacterStateArray(taxonIndex, 0, mesNchar);
129+
Taxon mesTaxon = mesData.getTaxa().getTaxon(taxonIndex);
102130
OTU xmlTaxon = findEquivalentTaxon(mesTaxon,xmlMatrix.getOTUs());
103-
for ( int k = 0; k < mesNchar; k++ ) {
104-
Character xmlChar = xmlCharacters.get(k);
105-
String mesCharString = mesChars[k].toDisplayString();
106-
if ( mesCharString != null && !mesCharString.equals("-") ) {
107-
if ( mesDataType.equalsIgnoreCase(ContinuousData.DATATYPENAME) ) {
108-
MatrixCell<Double> xmlCell = (MatrixCell<Double>) xmlMatrix.getCell(xmlTaxon,xmlChar);
109-
xmlCell.setValue((Double)xmlMatrix.parseSymbol(mesCharString));
131+
for ( int characterIndex = 0; characterIndex < mesNchar; characterIndex++ ) {
132+
Character xmlChar = xmlCharacters.get(characterIndex);
133+
CharacterState mesState = mesCharStates[characterIndex];
134+
if (mesDataType.equalsIgnoreCase(CategoricalData.DATATYPENAME)) {
135+
CharacterStateSet xmlStateSet = xmlChar.getCharacterStateSet();
136+
CategoricalData categoricalData = (CategoricalData)mesData;
137+
long stateAssignment = categoricalData.getState(characterIndex, taxonIndex);
138+
org.nexml.model.CharacterState xmlCharacterState = null;
139+
if (CategoricalState.hasMultipleStates(stateAssignment)) {
140+
Set<String> symbols = new HashSet<String>();
141+
for (int mesStateCode : CategoricalState.expand(stateAssignment)) {
142+
symbols.add(String.valueOf(categoricalData.getSymbol(mesStateCode)));
143+
}
144+
if (CategoricalState.isUncertain(stateAssignment)) {
145+
xmlCharacterState = findOrCreateUncertainStateSet(xmlStateSet, symbols);
146+
} else { //polymorphic
147+
xmlCharacterState = findOrCreatePolymorphicStateSet(xmlStateSet, symbols);
148+
}
149+
} else { // single state
150+
if ((!CategoricalState.isUnassigned(stateAssignment)) && (!CategoricalState.isInapplicable(stateAssignment))) {
151+
String symbol = String.valueOf(categoricalData.getSymbol(CategoricalState.getOnlyElement(stateAssignment)));
152+
xmlCharacterState = xmlStateSet.lookupCharacterStateBySymbol(symbol);
153+
}
110154
}
111-
else if ( mesDataType.equalsIgnoreCase(CategoricalData.DATATYPENAME) ) {
112-
MatrixCell<org.nexml.model.CharacterState> xmlCell = (MatrixCell<org.nexml.model.CharacterState>) xmlMatrix.getCell(xmlTaxon,xmlChar);
113-
xmlCell.setValue((org.nexml.model.CharacterState)xmlMatrix.parseSymbol(mesCharString));
155+
if (xmlCharacterState != null) {
156+
MatrixCell<org.nexml.model.CharacterState> xmlCell = (MatrixCell<org.nexml.model.CharacterState>) xmlMatrix.getCell(xmlTaxon, xmlChar);
157+
xmlCell.setValue(xmlCharacterState);
114158
}
115-
else if ( xmlMolecularDataTypeFor.containsKey(mesDataType) ) {
116-
MatrixCell<org.nexml.model.CharacterState> xmlCell = (MatrixCell<org.nexml.model.CharacterState>) xmlMatrix.getCell(xmlTaxon,xmlChar);
117-
xmlCell.setValue((org.nexml.model.CharacterState)((MolecularMatrix)xmlMatrix).parseSymbol(mesCharString,xmlMolecularDataTypeFor.get(mesDataType)));
118-
}
119-
}
159+
} else if (mesDataType.equalsIgnoreCase(ContinuousData.DATATYPENAME)) {
160+
MatrixCell<Double> xmlCell = (MatrixCell<Double>) xmlMatrix.getCell(xmlTaxon,xmlChar);
161+
xmlCell.setValue((Double)xmlMatrix.parseSymbol(mesState.toDisplayString(), xmlChar));
162+
} else if ( xmlMolecularDataTypeFor.containsKey(mesDataType) ) {
163+
MatrixCell<org.nexml.model.CharacterState> xmlCell = (MatrixCell<org.nexml.model.CharacterState>) xmlMatrix.getCell(xmlTaxon,xmlChar);
164+
xmlCell.setValue((org.nexml.model.CharacterState)((MolecularMatrix)xmlMatrix).parseSymbol(mesState.toDisplayString(), xmlMolecularDataTypeFor.get(mesDataType)));
165+
}
120166
}
121167
}
122168
}
@@ -131,4 +177,51 @@ protected Annotatable getThingInXmlBlock(NexmlWritable xmlBlock, int index) {
131177
return xmlMatrix.getCharacters().get(index);
132178
}
133179

180+
private UncertainCharacterState findOrCreateUncertainStateSet(CharacterStateSet containingStateSet, Set<String> symbols) {
181+
for (org.nexml.model.CharacterState state : containingStateSet.getCharacterStates()) {
182+
if (state instanceof UncertainCharacterState) {
183+
UncertainCharacterState uncertainState = (UncertainCharacterState)state;
184+
if (containsMatchingStates(uncertainState, symbols)) {
185+
return uncertainState;
186+
}
187+
}
188+
}
189+
Set<org.nexml.model.CharacterState> memberStates = collectMatchingStates(containingStateSet, symbols);
190+
return containingStateSet.createUncertainCharacterState(this.nextMultipleStateSymbol++, memberStates);
191+
}
192+
193+
private PolymorphicCharacterState findOrCreatePolymorphicStateSet(CharacterStateSet containingStateSet, Set<String> symbols) {
194+
for (org.nexml.model.CharacterState state : containingStateSet.getCharacterStates()) {
195+
if (state instanceof PolymorphicCharacterState) {
196+
PolymorphicCharacterState polymorphicState = (PolymorphicCharacterState)state;
197+
if (containsMatchingStates(polymorphicState, symbols)) {
198+
return polymorphicState;
199+
}
200+
}
201+
}
202+
Set<org.nexml.model.CharacterState> memberStates = collectMatchingStates(containingStateSet, symbols);
203+
return containingStateSet.createPolymorphicCharacterState(this.nextMultipleStateSymbol++, memberStates);
204+
}
205+
206+
private boolean containsMatchingStates(CompoundCharacterState state, Set<String> symbols) {
207+
Set<String> containedSymbols = new HashSet<String>();
208+
for (org.nexml.model.CharacterState containedState : state.getStates()) {
209+
containedSymbols.add(containedState.getSymbol().toString());
210+
}
211+
return containedSymbols.equals(symbols);
212+
}
213+
214+
private Set<org.nexml.model.CharacterState> collectMatchingStates(CharacterStateSet containingStateSet, Set<String> symbols) {
215+
Set<org.nexml.model.CharacterState> memberStates = new HashSet<org.nexml.model.CharacterState>();
216+
for (String symbol : symbols) {
217+
org.nexml.model.CharacterState member = containingStateSet.lookupCharacterStateBySymbol(symbol);
218+
if ( null != member ) {
219+
memberStates.add(member);
220+
} else {
221+
memberStates.add(containingStateSet.createCharacterState(symbol));
222+
}
223+
}
224+
return memberStates;
225+
}
226+
134227
}

src/org/nexml/model/Matrix.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public interface Matrix<T> extends OTUsLinkable, Annotatable, Segmented<Characte
6969
* @param symbol
7070
* @return
7171
*/
72-
T parseSymbol(String symbol);
72+
T parseSymbol(String symbol, Character character);
7373

7474
/**
7575
* Creates a row element for OTU otu, and populates

src/org/nexml/model/impl/AnnotationImpl.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ public void setValue(Set<Annotation> value) {
221221
mValue = value;
222222
getElement().setAttribute(XSI_TYPE,ResourceMeta);
223223
for ( Annotation annotation : value ) {
224-
getElement().appendChild(((AnnotationImpl)annotation).getElement());
224+
Node node = getElement().getOwnerDocument().adoptNode(((AnnotationImpl)annotation).getElement());
225+
getElement().appendChild(node);
225226
}
226227
}
227228

0 commit comments

Comments
 (0)