Skip to content

Commit 684df8a

Browse files
committed
Handle surrogate pair characters substring
1 parent 0bbc8eb commit 684df8a

6 files changed

Lines changed: 149 additions & 144 deletions

File tree

Lines changed: 137 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -1,136 +1,137 @@
1-
/*
2-
* Copyright (c) 2007-2019 LabKey Corporation
3-
*
4-
* Licensed under the Apache License, Version 2.0 (the "License");
5-
* you may not use this file except in compliance with the License.
6-
* You may obtain a copy of the License at
7-
*
8-
* http://www.apache.org/licenses/LICENSE-2.0
9-
*
10-
* Unless required by applicable law or agreed to in writing, software
11-
* distributed under the License is distributed on an "AS IS" BASIS,
12-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
* See the License for the specific language governing permissions and
14-
* limitations under the License.
15-
*/
16-
package org.labkey.flow;
17-
18-
import org.apache.commons.lang3.StringUtils;
19-
import org.apache.xmlbeans.XmlError;
20-
import org.apache.xmlbeans.XmlException;
21-
import org.apache.xmlbeans.XmlOptions;
22-
import org.fhcrc.cpas.flow.script.xml.ScriptDef;
23-
import org.fhcrc.cpas.flow.script.xml.ScriptDocument;
24-
import org.xml.sax.SAXParseException;
25-
26-
import java.io.StringReader;
27-
import java.util.ArrayList;
28-
import java.util.List;
29-
30-
public class ScriptParser
31-
{
32-
List<Error> _errors;
33-
ScriptDef _script;
34-
35-
public ScriptParser()
36-
{
37-
}
38-
39-
static public class Error
40-
{
41-
String _message;
42-
int _line;
43-
int _column;
44-
45-
public Error(String message)
46-
{
47-
this(message, 0, 0);
48-
}
49-
50-
public Error(String message, int line, int column)
51-
{
52-
_message = message;
53-
_line = line;
54-
_column = column;
55-
}
56-
57-
public Error(SAXParseException spe)
58-
{
59-
this(spe.getLocalizedMessage(), spe.getLineNumber(), spe.getColumnNumber());
60-
}
61-
62-
public String getMessage()
63-
{
64-
return _message;
65-
}
66-
67-
public int getLine()
68-
{
69-
return _line;
70-
}
71-
72-
public int getColumn()
73-
{
74-
return _column;
75-
}
76-
}
77-
78-
public void parse(String script)
79-
{
80-
try
81-
{
82-
XmlOptions options = new XmlOptions();
83-
List<XmlError> errors = new ArrayList<>();
84-
options.setDocumentType(ScriptDocument.type);
85-
script = StringUtils.replace(script, "<script>", "<script xmlns=\"" + ScriptDocument.type.getContentModel().getName().getNamespaceURI() + "\">");
86-
ScriptDocument doc = ScriptDocument.Factory.parse(new StringReader(script), options);
87-
options.setErrorListener(errors);
88-
89-
if (!doc.validate(options))
90-
{
91-
for (XmlError xmlError : errors)
92-
{
93-
String message = xmlError.getMessage();
94-
message = StringUtils.replace(message, "@" + ScriptDocument.type.getContentModel().getName().getNamespaceURI(), "");
95-
String location = xmlError.getCursorLocation().xmlText();
96-
if (location.length() > 100)
97-
location = location.substring(0, 100);
98-
addError(new Error("Schema Validation Error: " + message + "\nLocation of invalid XML: " + location));
99-
}
100-
}
101-
_script = doc.getScript();
102-
}
103-
catch (XmlException xmlException)
104-
{
105-
if (xmlException.getErrors().isEmpty())
106-
{
107-
addError(new Error(xmlException.toString()));
108-
}
109-
else
110-
{
111-
for (XmlError xmlError : xmlException.getErrors())
112-
{
113-
addError(new Error(xmlError.getMessage(), xmlError.getLine(), xmlError.getColumn() > 0 ? xmlError.getColumn() : 1));
114-
}
115-
}
116-
}
117-
catch (Exception e)
118-
{
119-
addError(new Error(e.toString()));
120-
}
121-
}
122-
123-
void addError(Error error)
124-
{
125-
if (_errors == null)
126-
_errors = new ArrayList<>();
127-
_errors.add(error);
128-
}
129-
130-
public Error[] getErrors()
131-
{
132-
if (_errors == null || _errors.isEmpty())
133-
return null;
134-
return _errors.toArray(new Error[0]);
135-
}
136-
}
1+
/*
2+
* Copyright (c) 2007-2019 LabKey Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.labkey.flow;
17+
18+
import org.apache.commons.lang3.StringUtils;
19+
import org.apache.xmlbeans.XmlError;
20+
import org.apache.xmlbeans.XmlException;
21+
import org.apache.xmlbeans.XmlOptions;
22+
import org.fhcrc.cpas.flow.script.xml.ScriptDef;
23+
import org.fhcrc.cpas.flow.script.xml.ScriptDocument;
24+
import org.labkey.api.util.StringUtilsLabKey;
25+
import org.xml.sax.SAXParseException;
26+
27+
import java.io.StringReader;
28+
import java.util.ArrayList;
29+
import java.util.List;
30+
31+
public class ScriptParser
32+
{
33+
List<Error> _errors;
34+
ScriptDef _script;
35+
36+
public ScriptParser()
37+
{
38+
}
39+
40+
static public class Error
41+
{
42+
String _message;
43+
int _line;
44+
int _column;
45+
46+
public Error(String message)
47+
{
48+
this(message, 0, 0);
49+
}
50+
51+
public Error(String message, int line, int column)
52+
{
53+
_message = message;
54+
_line = line;
55+
_column = column;
56+
}
57+
58+
public Error(SAXParseException spe)
59+
{
60+
this(spe.getLocalizedMessage(), spe.getLineNumber(), spe.getColumnNumber());
61+
}
62+
63+
public String getMessage()
64+
{
65+
return _message;
66+
}
67+
68+
public int getLine()
69+
{
70+
return _line;
71+
}
72+
73+
public int getColumn()
74+
{
75+
return _column;
76+
}
77+
}
78+
79+
public void parse(String script)
80+
{
81+
try
82+
{
83+
XmlOptions options = new XmlOptions();
84+
List<XmlError> errors = new ArrayList<>();
85+
options.setDocumentType(ScriptDocument.type);
86+
script = StringUtils.replace(script, "<script>", "<script xmlns=\"" + ScriptDocument.type.getContentModel().getName().getNamespaceURI() + "\">");
87+
ScriptDocument doc = ScriptDocument.Factory.parse(new StringReader(script), options);
88+
options.setErrorListener(errors);
89+
90+
if (!doc.validate(options))
91+
{
92+
for (XmlError xmlError : errors)
93+
{
94+
String message = xmlError.getMessage();
95+
message = StringUtils.replace(message, "@" + ScriptDocument.type.getContentModel().getName().getNamespaceURI(), "");
96+
String location = xmlError.getCursorLocation().xmlText();
97+
if (location.length() > 100)
98+
location = StringUtilsLabKey.leftSurrogatePairFriendly(location, 100);
99+
addError(new Error("Schema Validation Error: " + message + "\nLocation of invalid XML: " + location));
100+
}
101+
}
102+
_script = doc.getScript();
103+
}
104+
catch (XmlException xmlException)
105+
{
106+
if (xmlException.getErrors().isEmpty())
107+
{
108+
addError(new Error(xmlException.toString()));
109+
}
110+
else
111+
{
112+
for (XmlError xmlError : xmlException.getErrors())
113+
{
114+
addError(new Error(xmlError.getMessage(), xmlError.getLine(), xmlError.getColumn() > 0 ? xmlError.getColumn() : 1));
115+
}
116+
}
117+
}
118+
catch (Exception e)
119+
{
120+
addError(new Error(e.toString()));
121+
}
122+
}
123+
124+
void addError(Error error)
125+
{
126+
if (_errors == null)
127+
_errors = new ArrayList<>();
128+
_errors.add(error);
129+
}
130+
131+
public Error[] getErrors()
132+
{
133+
if (_errors == null || _errors.isEmpty())
134+
return null;
135+
return _errors.toArray(new Error[0]);
136+
}
137+
}

protein/api-src/org/labkey/api/protein/ProteinManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.labkey.api.util.HashHelpers;
2121
import org.labkey.api.util.HtmlString;
2222
import org.labkey.api.util.LinkBuilder;
23+
import org.labkey.api.util.StringUtilsLabKey;
2324
import org.labkey.api.view.NotFoundException;
2425

2526
import java.io.ByteArrayOutputStream;
@@ -100,7 +101,7 @@ private static SimpleProtein ensureProteinInDatabase(String sequence, Organism o
100101
map.put("Mass", PeptideHelpers.computeMass(sequenceBytes, 0, sequenceBytes.length, PeptideHelpers.AMINO_ACID_AVERAGE_MASSES));
101102
map.put("OrgId", organism.getOrgId());
102103
map.put("Hash", hashSequence(sequence));
103-
map.put("Description", description == null ? null : (description.length() > 200 ? description.substring(0, 196) + "..." : description));
104+
map.put("Description", description == null ? null : (description.length() > 200 ? StringUtilsLabKey.leftSurrogatePairFriendly(description, 196) + "..." : description));
104105
map.put("BestName", name);
105106
map.put("Length", sequence.length());
106107
map.put("InsertDate", new Date());

protein/api-src/org/labkey/api/protein/ProteinPlus.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import org.labkey.api.protein.fasta.FastaProtein;
2020
import org.labkey.api.util.HashHelpers;
21+
import org.labkey.api.util.StringUtilsLabKey;
2122

2223
public class ProteinPlus
2324
{
@@ -111,7 +112,7 @@ public String getBestName()
111112
{
112113
result = getProtein().getHeader();
113114
}
114-
if (result.length() > 500) result = result.substring(0, 499);
115+
if (result.length() > 500) result = StringUtilsLabKey.leftSurrogatePairFriendly(result, 499);
115116
return result;
116117
}
117118
}

protein/api-src/org/labkey/api/protein/fasta/FastaDbLoader.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.labkey.api.protein.organism.OrganismGuessStrategy;
3838
import org.labkey.api.util.HashHelpers;
3939
import org.labkey.api.util.NetworkDrive;
40+
import org.labkey.api.util.StringUtilsLabKey;
4041
import org.labkey.api.view.ViewBackgroundInfo;
4142

4243
import java.io.File;
@@ -259,7 +260,7 @@ protected void preProcessSequences(List<ProteinPlus> mouthful, Connection c, Log
259260
}
260261
else
261262
{
262-
if (desc.length() >= 200) desc = desc.substring(0, 195) + "...";
263+
if (desc.length() >= 200) desc = StringUtilsLabKey.leftSurrogatePairFriendly(desc, 195) + "...";
263264
fdbu._addSeqStmt.setString(3, desc);
264265
}
265266
fdbu._addSeqStmt.setDouble(4, curSeq.getProtein().getMass());

protein/api-src/org/labkey/api/protein/fasta/IdPattern.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.util.regex.Matcher;
2828
import java.util.regex.Pattern;
2929
import java.util.regex.PatternSyntaxException;
30+
import org.labkey.api.util.StringUtilsLabKey;
3031

3132
/**
3233
* this class implements a regular expression-based recognition of identifiers parsed from the fasta files.
@@ -178,7 +179,7 @@ public static Map<String, Set<String>> createIdMap(String key, String value)
178179
{
179180
v = v.trim();
180181
if (v.length() > 50)
181-
v = v.substring(0, 50);
182+
v = StringUtilsLabKey.leftSurrogatePairFriendly(v, 50);
182183
if (!v.isEmpty())
183184
vals.add(v);
184185
}

protein/api-src/org/labkey/api/protein/uniprot/uniprot.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ public int insertSequences(ParseContext context, Connection conn) throws SQLExce
605605
else
606606
{
607607
String tmp = curSeq.getDescription();
608-
if (tmp.length() >= 200) tmp = tmp.substring(0, 190) + "...";
608+
if (tmp.length() >= 200) tmp = StringUtilsLabKey.leftSurrogatePairFriendly(tmp, 190) + "...";
609609
_addSeq.setString(3, tmp);
610610
}
611611
if (curSeq.getSourceChangeDate() == null)
@@ -657,7 +657,7 @@ public int insertSequences(ParseContext context, Connection conn) throws SQLExce
657657
else
658658
{
659659
String tmp = curSeq.getBestName();
660-
if (tmp.length() >= 50) tmp = tmp.substring(0, 45) + "...";
660+
if (tmp.length() >= 50) tmp = StringUtilsLabKey.leftSurrogatePairFriendly(tmp, 45) + "...";
661661
_addSeq.setString(11, tmp);
662662
}
663663
if (curSeq.getBestGeneName() == null)
@@ -667,7 +667,7 @@ public int insertSequences(ParseContext context, Connection conn) throws SQLExce
667667
else
668668
{
669669
String tmp = curSeq.getBestGeneName();
670-
if (tmp.length() >= 50) tmp = tmp.substring(0, 45) + "...";
670+
if (tmp.length() >= 50) tmp = StringUtilsLabKey.leftSurrogatePairFriendly(tmp, 45) + "...";
671671
_addSeq.setString(12, tmp);
672672
}
673673
// Timestamp at index 13 is set once for the whole prepared statement
@@ -709,7 +709,7 @@ public int insertIdentifiers(ParseContext context, Connection conn) throws SQLEx
709709
{
710710
transactionCount++;
711711
String curIdentVal = curIdent.getIdentifier();
712-
if (curIdentVal.length() > 50) curIdentVal = curIdentVal.substring(0, 45) + "...";
712+
if (curIdentVal.length() > 50) curIdentVal = StringUtilsLabKey.leftSurrogatePairFriendly(curIdentVal, 45) + "...";
713713
_addIdent.setString(1, curIdentVal);
714714
_addIdent.setString(2, curIdent.getIdentType());
715715
UniprotSequence curSeq = curIdent.getSequence();

0 commit comments

Comments
 (0)