diff --git a/build.sbt b/build.sbt index ff0bbf6f7..466c68524 100644 --- a/build.sbt +++ b/build.sbt @@ -1,16 +1,19 @@ val scala211 = "2.11.12" // up to 2.11.12 -val scala212 = "2.12.17" // up to 2.12.18 -val scala213 = "2.13.10" // up to 2.13.11 +val scala212 = "2.12.18" // up to 2.12.18 +val scala213 = "2.13.12" // up to 2.13.12 val scala30 = "3.0.2" // up to 3.0.2 val scala31 = "3.1.3" // up to 3.1.3 -val scala32 = "3.2.1" // up to 3.2.2 -val scala33 = "3.3.0" // up to 3.3.0 +val scala32 = "3.2.2" // up to 3.2.2 +val scala33 = "3.3.1" // up to 3.3.1 + +val scala3 = scala31 // See https://www.scala-lang.org/blog/2022/08/17/long-term-compatibility-plans.html. // Scala30: "If you are maintaining a library, you should drop Scala 3.0." Dropped. -// Scala31: This is the current LTS (long term support) version and default Scala 3 release. +// Scala31: This is a LTS (long term support) version before it was called that. // Scala32: This is for experimentation, as in Scala Next, and not for release. -ThisBuild / crossScalaVersions := Seq(scala212, scala211, scala213, scala31) +// Scala33: This is the first official LTS, but hold off until necessary. 
+ThisBuild / crossScalaVersions := Seq(scala212, scala211, scala213, scala3) ThisBuild / scalaVersion := crossScalaVersions.value.head lazy val root = (project in file(".")) diff --git a/corenlp/src/main/scala/org/clulab/processors/TextLabelToCoNNLU.scala b/corenlp/src/main/scala/org/clulab/processors/TextLabelToCoNNLU.scala index a379feef7..cdfc6e581 100644 --- a/corenlp/src/main/scala/org/clulab/processors/TextLabelToCoNNLU.scala +++ b/corenlp/src/main/scala/org/clulab/processors/TextLabelToCoNNLU.scala @@ -1,13 +1,15 @@ package org.clulab.processors -import java.io.{File, FileFilter, PrintWriter} import org.clulab.processors.clu.BalaurProcessor import org.clulab.processors.fastnlp.FastNLPProcessor +import org.clulab.struct.GraphMap import org.clulab.utils.{FileUtils, Sourcer, StringUtils} import org.slf4j.{Logger, LoggerFactory} + +import java.io.{File, FileFilter, PrintWriter} +import scala.util.Using + import TextLabelToCoNLLU._ -import org.clulab.struct.GraphMap -import org.clulab.utils.Closer.AutoCloser /** * Processes raw text and saves the output in the CoNLL-U format @@ -24,9 +26,9 @@ class TextLabelToCoNLLU(val proc:Processor, val isCoreNLP:Boolean) { try { val doc = parseFile(f) val ofn = s"$outDir/${f.getName.substring(0, f.getName.length - 4)}.conllu" - val pw = new PrintWriter(ofn) - toCoNLLU(doc, pw) - pw.close() + Using.resource(new PrintWriter(ofn)) { pw => + toCoNLLU(doc, pw) + } } catch { case e:Exception => { logger.error(s"Parsing of file $f failed with error:") @@ -77,7 +79,7 @@ class TextLabelToCoNLLU(val proc:Processor, val isCoreNLP:Boolean) { def parseFile(f:File):Document = { def option1(): Document = { - val tokens = Sourcer.sourceFromFile(f).autoClose { source => + val tokens = Using.resource(Sourcer.sourceFromFile(f)) { source => for (line <- source.getLines()) yield line.split(' ').toSeq }.toSeq diff --git a/corenlp/src/main/scala/org/clulab/processors/TextToCoNLLU.scala 
b/corenlp/src/main/scala/org/clulab/processors/TextToCoNLLU.scala index 1f0955432..c76514525 100644 --- a/corenlp/src/main/scala/org/clulab/processors/TextToCoNLLU.scala +++ b/corenlp/src/main/scala/org/clulab/processors/TextToCoNLLU.scala @@ -1,13 +1,16 @@ package org.clulab.processors -import java.io.{File, FileFilter, PrintWriter} import org.clulab.processors.clu.BalaurProcessor import org.clulab.processors.fastnlp.FastNLPProcessor +import org.clulab.struct.GraphMap import org.clulab.utils.StringUtils import org.slf4j.{Logger, LoggerFactory} + +import java.io.{File, FileFilter, PrintWriter} +import scala.util.Using + import TextToCoNLLU._ -import org.clulab.struct.GraphMap /** * Processes raw text and saves the output in the CoNLL-U format @@ -24,9 +27,9 @@ class TextToCoNLLU(val proc:Processor, val isCoreNLP:Boolean) { try { val doc = parseFile(f) val ofn = s"$outDir/${f.getName.substring(0, f.getName.length - 4)}.conllu" - val pw = new PrintWriter(ofn) - toCoNLLU(doc, pw) - pw.close() + Using.resource(new PrintWriter(ofn)) { pw => + toCoNLLU(doc, pw) + } } catch { case e:Exception => { logger.error(s"Parsing of file $f failed with error:") @@ -65,13 +68,13 @@ class TextToCoNLLU(val proc:Processor, val isCoreNLP:Boolean) { } def parseFile(f:File):Document = { - val s = scala.io.Source.fromFile(f) val buffer = new StringBuilder - for(line <- s.getLines()) { - buffer.append(line) - buffer.append("\n") + Using.resource(scala.io.Source.fromFile(f)) { s => + for (line <- s.getLines()) { + buffer.append(line) + buffer.append("\n") + } } - s.close() val doc = proc.mkDocument(buffer.toString()) annotate(doc) diff --git a/corenlp/src/main/scala/org/clulab/processors/corenlp/CoreNLPDocument.scala b/corenlp/src/main/scala/org/clulab/processors/corenlp/CoreNLPDocument.scala index 776370124..f5ddcbff9 100644 --- a/corenlp/src/main/scala/org/clulab/processors/corenlp/CoreNLPDocument.scala +++ b/corenlp/src/main/scala/org/clulab/processors/corenlp/CoreNLPDocument.scala @@ 
-14,14 +14,14 @@ class CoreNLPDocument(sentences: Array[Sentence]) extends Document(sentences) { var annotation:Option[Annotation] = None - def copy(document: CoreNLPDocument): CoreNLPDocument = { - super.copy(document) + def assimilate(document: CoreNLPDocument, textOpt: Option[String]): CoreNLPDocument = { + super.assimilate(document, textOpt) annotation = document.annotation this } - override def copy(sentences: Array[Sentence] = sentences): CoreNLPDocument = - new CoreNLPDocument(sentences).copy(this) + override def copy(sentences: Array[Sentence] = sentences, textOpt: Option[String] = text): CoreNLPDocument = + new CoreNLPDocument(sentences).assimilate(this, textOpt) override def clear(): Unit = { //println("Clearing state from document.") diff --git a/corenlp/src/main/scala/org/clulab/processors/corenlp/chunker/TrainChunker.scala b/corenlp/src/main/scala/org/clulab/processors/corenlp/chunker/TrainChunker.scala index 3501d682a..92f949aeb 100644 --- a/corenlp/src/main/scala/org/clulab/processors/corenlp/chunker/TrainChunker.scala +++ b/corenlp/src/main/scala/org/clulab/processors/corenlp/chunker/TrainChunker.scala @@ -1,11 +1,13 @@ package org.clulab.processors.corenlp.chunker +import edu.stanford.nlp.ling.{ CoreLabel, CoreAnnotations } import org.clulab.scala.WrappedArray._ + import java.io.FileInputStream import java.util.zip.GZIPInputStream import scala.collection.mutable import scala.io.Source -import edu.stanford.nlp.ling.{ CoreLabel, CoreAnnotations } +import scala.util.Using object TrainChunker extends App { @@ -63,9 +65,9 @@ object TrainChunker extends App { def readData(path: String): Array[Array[CoreLabel]] = { val is = new GZIPInputStream(new FileInputStream(path)) - val source = Source.fromInputStream(is) - val text = source.mkString - source.close() + val text = Using.resource(Source.fromInputStream(is)) { source => + source.mkString + } // sentences are separated by an empty line val sentences = text.split("\n\n") sentences.map { sent => diff 
--git a/corenlp/src/main/scala/org/clulab/processors/examples/DocumentSerializerExample.scala b/corenlp/src/main/scala/org/clulab/processors/examples/DocumentSerializerExample.scala index 0e1dce85e..cf6781151 100644 --- a/corenlp/src/main/scala/org/clulab/processors/examples/DocumentSerializerExample.scala +++ b/corenlp/src/main/scala/org/clulab/processors/examples/DocumentSerializerExample.scala @@ -1,9 +1,10 @@ package org.clulab.processors.examples -import java.io.{BufferedReader, FileReader} - import org.clulab.serialization.DocumentSerializer +import java.io.{BufferedReader, FileReader} +import scala.util.Using + /** * * User: mihais @@ -11,21 +12,21 @@ import org.clulab.serialization.DocumentSerializer */ object DocumentSerializerExample { def main(args:Array[String]): Unit = { - val ds = new DocumentSerializer - val r = new BufferedReader(new FileReader(args(0))) - var done = false var count = 0 - while(! done) { - val d = ds.load(r) - if(d == null) { - done = true - } else { - count += 1 - if(count % 10 == 0) - println(s"Loaded $count documents...") + Using.resource(new BufferedReader(new FileReader(args(0)))) { r => + val ds = new DocumentSerializer + var done = false + while (!done) { + val d = ds.load(r) + if (d == null) { + done = true + } else { + count += 1 + if (count % 10 == 0) + println(s"Loaded $count documents...") + } } } - r.close() println(s"Done! Loaded $count documents.") } } diff --git a/corenlp/src/main/scala/org/clulab/processors/examples/DocumentationExample.scala b/corenlp/src/main/scala/org/clulab/processors/examples/DocumentationExample.scala index 0073cc5a2..624433af4 100644 --- a/corenlp/src/main/scala/org/clulab/processors/examples/DocumentationExample.scala +++ b/corenlp/src/main/scala/org/clulab/processors/examples/DocumentationExample.scala @@ -8,7 +8,7 @@ import org.clulab.struct.DirectedGraphEdgeIterator object DocumentationExample extends App { // Create the processor. Any processor works here! 
- // Try FastNLPProcessor or our own CluProcessor. + // Try FastNLPProcessor or our own BalaurProcessor. val proc: Processor = new CoreNLPProcessor() // val proc: Processor = new FastNLPProcessor() diff --git a/corenlp/src/main/scala/org/clulab/processors/examples/InfiniteParallelProcessorExample.scala b/corenlp/src/main/scala/org/clulab/processors/examples/InfiniteParallelProcessorExample.scala index fe176fa7b..b64540afa 100644 --- a/corenlp/src/main/scala/org/clulab/processors/examples/InfiniteParallelProcessorExample.scala +++ b/corenlp/src/main/scala/org/clulab/processors/examples/InfiniteParallelProcessorExample.scala @@ -4,17 +4,14 @@ import org.clulab.processors.Document import org.clulab.processors.Processor import org.clulab.processors.fastnlp.FastNLPProcessor import org.clulab.serialization.DocumentSerializer -import org.clulab.utils.Closer.AutoCloser -import org.clulab.utils.FileUtils -import org.clulab.utils.ThreadUtils -import org.clulab.utils.Timer +import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} import java.io.BufferedOutputStream import java.io.File import java.io.FileOutputStream import java.io.PrintWriter -import java.io.StringWriter import scala.collection.parallel.ParSeq +import scala.util.Using object InfiniteParallelProcessorExample { @@ -49,15 +46,8 @@ object InfiniteParallelProcessorExample { val text = FileUtils.getTextFromFile(file) val outputFile = new File(outputDir + "/" + file.getName) val document = processor.annotate(text) - val printedDocument = { - val stringWriter = new StringWriter - - new PrintWriter(stringWriter).autoClose { printWriter => - printDocument(document, printWriter) - } - - val result = stringWriter.toString - result + val printedDocument = StringUtils.viaPrintWriter { printWriter => + printDocument(document, printWriter) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument @@ -86,7 +76,7 @@ object InfiniteParallelProcessorExample { def 
run(args: Array[String]): Unit = { mainWithCallback(args) { case (file: File, contents: String) => - new PrintWriter(new BufferedOutputStream(new FileOutputStream(file))).autoClose { printWriter => + Using.resource(new PrintWriter(file)) { printWriter => printWriter.println(contents) } } diff --git a/corenlp/src/main/scala/org/clulab/processors/examples/ParallelProcessorExample.scala b/corenlp/src/main/scala/org/clulab/processors/examples/ParallelProcessorExample.scala index b7d0f0501..be7037578 100644 --- a/corenlp/src/main/scala/org/clulab/processors/examples/ParallelProcessorExample.scala +++ b/corenlp/src/main/scala/org/clulab/processors/examples/ParallelProcessorExample.scala @@ -1,19 +1,14 @@ package org.clulab.processors.examples -import java.io.BufferedOutputStream -import java.io.File -import java.io.FileOutputStream -import java.io.PrintWriter -import java.io.StringWriter import org.clulab.processors.Document import org.clulab.processors.Processor import org.clulab.processors.clu.BalaurProcessor -import org.clulab.processors.fastnlp.FastNLPProcessor import org.clulab.serialization.DocumentSerializer -import org.clulab.utils.Closer.AutoCloser -import org.clulab.utils.FileUtils -import org.clulab.utils.ThreadUtils -import org.clulab.utils.Timer +import org.clulab.utils.{FileUtils, StringUtils, ThreadUtils, Timer} + +import java.io.File +import java.io.PrintWriter +import scala.util.Using object ParallelProcessorExample { @@ -25,7 +20,7 @@ object ParallelProcessorExample { val outputDir = args(1) val extension = args(2) val threads = args(3).toInt - val parallel = args.lift(4).exists(_ == "true") + val parallel = args.lift(4).contains("true") val files = FileUtils.findFiles(inputDir, extension) val serFiles = files.sortBy(-_.length) @@ -60,15 +55,8 @@ object ParallelProcessorExample { println(s"Threw exception for ${file.getName}") throw throwable } - val printedDocument = { - val stringWriter = new StringWriter - - new PrintWriter(stringWriter).autoClose { 
printWriter => - printDocument(document, printWriter) - } - - val result = stringWriter.toString - result + val printedDocument = StringUtils.viaPrintWriter { printWriter => + printDocument(document, printWriter) } val savedDocument = documentSerializer.save(document) val outputDocument = printedDocument + savedDocument @@ -83,7 +71,7 @@ object ParallelProcessorExample { def run(args: Array[String]): Unit = { mainWithCallback(args) { case (file: File, contents: String) => - new PrintWriter(new BufferedOutputStream(new FileOutputStream(file))).autoClose { printWriter => + Using.resource(new PrintWriter(file)) { printWriter => printWriter.println(contents) } } diff --git a/corenlp/src/main/scala/org/clulab/processors/examples/ProcessorExample.scala b/corenlp/src/main/scala/org/clulab/processors/examples/ProcessorExample.scala index 50b2c972a..f582c89b2 100644 --- a/corenlp/src/main/scala/org/clulab/processors/examples/ProcessorExample.scala +++ b/corenlp/src/main/scala/org/clulab/processors/examples/ProcessorExample.scala @@ -26,7 +26,7 @@ object ProcessorExample { // other processors supported: // BioNLPProcessor, and FastBioNLPProcessor - for the biomedical domain - // CluProcessor - similar to FastNLPProcessor, but using tools licensed under the Apache license + // BalaurProcessor - similar to FastNLPProcessor, but using tools licensed under the Apache license // the actual work is done here val doc = proc.annotate("John Smith went to China. 
He visited Beijing, on January 10th, 2013.") diff --git a/corenlp/src/test/scala/org/clulab/processors/TestOpenIE.scala b/corenlp/src/test/scala/org/clulab/processors/TestOpenIE.scala index a36ee000b..3bbc3b229 100644 --- a/corenlp/src/test/scala/org/clulab/processors/TestOpenIE.scala +++ b/corenlp/src/test/scala/org/clulab/processors/TestOpenIE.scala @@ -4,9 +4,7 @@ import org.clulab.processors.corenlp.CoreNLPProcessor import org.clulab.processors.fastnlp.FastNLPProcessor import org.clulab.processors.shallownlp.ShallowNLPProcessor import org.clulab.serialization.DocumentSerializer -import org.clulab.utils.Test - -import java.io.{PrintWriter, StringWriter} +import org.clulab.utils.{StringUtils, Test} import scala.collection.mutable @@ -20,10 +18,9 @@ class TestOpenIE extends Test { private lazy val fastNLPDoc = fastNLP.annotate(text) private lazy val coreNLPDoc = coreNLP.annotate(text) - private val buffer = new StringWriter() - serializer.save(fastNLPDoc, new PrintWriter(buffer)) - private val serialized = buffer.toString - + private val serialized = StringUtils.viaPrintWriter { printWriter => + serializer.save(fastNLPDoc, printWriter) + } private val deserializedDoc = serializer.load(serialized) def openIEBehavior(doc:Document): Unit = { diff --git a/corenlp/src/test/scala/org/clulab/processors/TestParenthesesInCore.scala b/corenlp/src/test/scala/org/clulab/processors/TestParenthesesInCore.scala index 6bb0f4b50..bd11d6715 100644 --- a/corenlp/src/test/scala/org/clulab/processors/TestParenthesesInCore.scala +++ b/corenlp/src/test/scala/org/clulab/processors/TestParenthesesInCore.scala @@ -13,7 +13,7 @@ import org.clulab.utils.Test class TestParenthesesInCore extends Test { val fast = new FastNLPProcessor() - "CluProcessor" should "tokenize, lemmatize, and POS tag parentheses correctly" in { + "Processor" should "tokenize, lemmatize, and POS tag parentheses correctly" in { val doc = fast.mkDocument("Moreover, in von Willebrand factor-stimulated platelets, the 
tyrosine phosphorylation of pp60(c-src) is closely associated with the activation of phosphatidylinositol 3-kinase (PIK), and two adhesion receptors, glycoprotein (Gp)Ib and GpIIb/IIIa(alpha-IIb-beta(3)), are involved. ") fast.tagPartsOfSpeech(doc) fast.lemmatize(doc) diff --git a/corenlp/src/test/scala/org/clulab/processors/TestRepeatability.scala b/corenlp/src/test/scala/org/clulab/processors/TestRepeatability.scala index 53870c7dc..dc1565a38 100644 --- a/corenlp/src/test/scala/org/clulab/processors/TestRepeatability.scala +++ b/corenlp/src/test/scala/org/clulab/processors/TestRepeatability.scala @@ -1,27 +1,17 @@ package org.clulab.processors -import org.clulab.processors.examples.ParallelProcessorExample import org.clulab.processors.fastnlp.FastNLPProcessor -import org.clulab.serialization.DocumentSerializer -import org.clulab.utils.FileUtils -import org.clulab.utils.Sourcer.utf8 -import org.clulab.utils.Test +import org.clulab.utils.{FileUtils, Sourcer, StringUtils, Test} import java.io.File -import java.io.PrintWriter -import java.io.StringWriter -import scala.collection.mutable -import scala.io.Source +import scala.util.Using class TestRepeatability extends Test { def printDocument(document: Document): String = { - val stringWriter = new StringWriter - val printWriter = new PrintWriter(stringWriter) - - document.prettyPrint(printWriter) - printWriter.close() - stringWriter.toString + StringUtils.viaPrintWriter { printWriter => + document.prettyPrint(printWriter) + } } val processor: Processor = new FastNLPProcessor() @@ -32,10 +22,9 @@ class TestRepeatability extends Test { val inputDir = FileUtils.getSubprojectDir("./corenlp/src/test/resources/documents") val file = new File(inputDir + "/16_South Sudan - Key Message Update_ Thu, 2018-01-25.txt") val text = { - val source = Source.fromFile(file, utf8) - val text = source.mkString.replace("\r\n", "\n") - - source.close() + val text = Using.resource(Sourcer.sourceFromFile(file)) { source => + 
source.mkString.replace("\r\n", "\n") + } val beginIndex = text.indexOf("This\nanalysis") val endIndex = text.indexOf("*According to the IPC") diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 5c012cc96..977858c6b 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -215,7 +215,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.13.10-x86_64-linux) + nokogiri (1.14.3-x86_64-linux) racc (~> 1.4) octokit (4.20.0) faraday (>= 0.9) diff --git a/main/build.sbt b/main/build.sbt index 8e52f0ce3..77cca9a4e 100644 --- a/main/build.sbt +++ b/main/build.sbt @@ -47,9 +47,9 @@ libraryDependencies ++= { "org.json4s" %% "json4s-core" % json4sVersion, // Apache-2.0 "org.json4s" %% "json4s-jackson" % json4sVersion, // Apache-2.0 // for machine learning - "org.clulab" % "deberta-onnx-model" % "0.0.4", + "org.clulab" % "deberta-onnx-model" % "0.1.0", // "org.clulab" % "roberta-onnx-model" % "0.0.2", - "org.clulab" %% "scala-transformers-encoder" % "0.3.1-SNAPSHOT", // Apache-2.0 + "org.clulab" %% "scala-transformers-encoder" % "0.4.0", // Apache-2.0 "de.bwaldvogel" % "liblinear" % "2.30", // BSD-3 "tw.edu.ntu.csie" % "libsvm" % "3.23", // BSD // NLP tools used by CluProcessor @@ -64,7 +64,7 @@ libraryDependencies ++= { // Local logging is provided here but not published. "ch.qos.logback" % "logback-classic" % "1.2.8", // up to 1.2.8; less than 1.2 is vulnerable // testing - "org.scalatest" %% "scalatest" % "3.2.10" % Test, // Apache-2.0 + "org.scalatest" %% "scalatest" % "3.2.15" % Test, // Apache-2.0 // trained models for local ML models used in both main and corenlp // These are stored in the CLU lab Artifactory instance, not maven! "org.clulab" % "glove-840b-300d-10f-kryo" % "1.0.0", // Apache-2.0 @@ -72,7 +72,8 @@ libraryDependencies ++= { // for odin "org.apache.commons" % "commons-text" % "1.1", // Apache-2.0 // See https://docs.scala-lang.org/overviews/core/collections-migration-213.html. 
- "org.scala-lang.modules" %% "scala-collection-compat" % "2.6.0", // up to 2.9.0, but match fatdynet // Apache-2.0 + // fatdynet 0.4.4 uses 2.6.0 which will be evicted. Move to fatdynet 0.4.5 for a 2.11.0 match. + "org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0", // up to 2.11.0 // Apache-2.0 "org.scala-lang.modules" %% "scala-parser-combinators" % combinatorsVersion, // Apache-2.0 "org.yaml" % "snakeyaml" % "1.14", // Apache-2.0 // progress bar for training diff --git a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv index 951633180..e2a48ec6e 100644 --- a/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv +++ b/main/src/main/resources/org/clulab/numeric/MEASUREMENT-UNIT.tsv @@ -27,6 +27,7 @@ yd // yd::distance foot // ft::length feet // ft::length ft // ft::length +ft. // ft::length inch // in::length inches // in::length in // in::length @@ -63,7 +64,9 @@ ton // t::mass t // t::mass carat // carat::mass pound // lb::mass +pounds // lb::mass lb // lb::mass +lbs // lb::mass ounce // oz::mass oz // oz::mass fl oz // oz::volume diff --git a/main/src/main/resources/org/clulab/numeric/SEASON.tsv b/main/src/main/resources/org/clulab/numeric/SEASON.tsv index bb9070f7b..bd37426a5 100644 --- a/main/src/main/resources/org/clulab/numeric/SEASON.tsv +++ b/main/src/main/resources/org/clulab/numeric/SEASON.tsv @@ -8,7 +8,7 @@ winter // XXXX-12-21 -- XXXX-03-20 spring // XXXX-03-20 -- XXXX-06-21 summer // XXXX-06-21 -- XXXX-09-22 autumn // XXXX-09-22 -- XXXX-12-21 - +fall // XXXX-09-22 -- XXXX-12-21 diff --git a/main/src/main/resources/org/clulab/numeric/WEEK.tsv b/main/src/main/resources/org/clulab/numeric/WEEK.tsv new file mode 100644 index 000000000..39db02765 --- /dev/null +++ b/main/src/main/resources/org/clulab/numeric/WEEK.tsv @@ -0,0 +1,16 @@ +# +# list of weeks and their date ranges, case insensitive so everything is lower case for simplicity +# the comments 
after // are required by WeekNormalizer to get the week date ranges! Do not remove +# the format for the date ranges must be MM-dd:MM-dd or MM:MM +# note: multi-word phrases must be tokenized in the same way as our tokenizer. If not sure, try the phrases in ./shell first! +# +first week // XXXX-XX-01 -- XXXX-XX-07 +1st week // XXXX-XX-01 -- XXXX-XX-07 +second week // XXXX-XX-08 -- XXXX-XX-14 +2nd week // XXXX-XX-08 -- XXXX-XX-14 +third week // XXXX-XX-15 -- XXXX-XX-21 +3rd week // XXXX-XX-15 -- XXXX-XX-21 +fourth week // XXXX-XX-22 -- XXXX-XX-28 +4th week // XXXX-XX-22 -- XXXX-XX-28 +first two weeks // XXXX-XX-01 -- XXXX-XX-14 +second two weeks // XXXX-XX-15 -- XXXX-XX-28 \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/numeric/atomic.yml b/main/src/main/resources/org/clulab/numeric/atomic.yml index b508665c9..e880e0af0 100644 --- a/main/src/main/resources/org/clulab/numeric/atomic.yml +++ b/main/src/main/resources/org/clulab/numeric/atomic.yml @@ -24,6 +24,14 @@ rules: pattern: | [word=/^(1\d\d\d|20\d\d)$/] + # weak possible years: 1d, 2d, 3d, 4d, 5d, 6d, 7d, 8d, 9d + - name: weakyear + label: WeakPossibleYear + priority: ${ rulepriority } + type: token + pattern: | + [word=/^[1-9]\d$/] + # possible day values, from 1 to 31 - name: day label: PossibleDay diff --git a/main/src/main/resources/org/clulab/numeric/date-ranges.yml b/main/src/main/resources/org/clulab/numeric/date-ranges.yml index f8f67942d..9b483a47d 100644 --- a/main/src/main/resources/org/clulab/numeric/date-ranges.yml +++ b/main/src/main/resources/org/clulab/numeric/date-ranges.yml @@ -85,6 +85,24 @@ pattern: | /(?i)until|through/ @date1:Date +- name: date-range-9 + priority: ${rulepriority} + label: DateRange + type: token + example: "First week of May" + action: mkDateRangeMentionWithWeek + pattern: | + (? 
/(?i)(first|1st|second|2nd|third|3rd|fourth|4th|last)/ /(?i)week/) /(?i)of/ @month:PossibleMonth + +- name: date-range-10 + priority: ${rulepriority} + label: DateRange + type: token + example: "First two weeks of May" + action: mkDateRangeMentionWithWeek + pattern: | + (? /(?i)(first|second|last)/ /(?i)two/ /(?i)weeks/) /(?i)of/ @month:PossibleMonth + - name: date-unbound-range-1 priority: ${rulepriority} label: DateRange @@ -113,6 +131,16 @@ pattern: | @season:PossibleSeason /(?i)(in|of)/? @year:PossibleYear? +# Date range derived from a season, with weak but mandatory year +- name: date-range-season-1b + priority: ${ rulepriority } + label: DateRange + type: token + example: "It was summer in 21" + action: mkDateRangeMentionWithSeason + pattern: | + @season:PossibleSeason /(?i)(in|of)/? @year:WeakPossibleYear + - name: date-range-season-2 priority: ${ rulepriority } label: DateRange diff --git a/main/src/main/resources/org/clulab/numeric/master.yml b/main/src/main/resources/org/clulab/numeric/master.yml index 9c5dec5b0..67143b6e0 100644 --- a/main/src/main/resources/org/clulab/numeric/master.yml +++ b/main/src/main/resources/org/clulab/numeric/master.yml @@ -7,6 +7,7 @@ taxonomy: - PossibleMonth - PossibleSeason - PossibleYear + - WeakPossibleYear - Number - Subatomic: - NumberWord diff --git a/main/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala b/main/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala index a24c5dda5..fa9dfa73d 100644 --- a/main/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala +++ b/main/src/main/scala-3/org/clulab/odinstarter/OdinStarter3.scala @@ -2,7 +2,7 @@ package org.clulab.odinstarter import org.clulab.odin.ExtractorEngine import org.clulab.odin.Mention -import org.clulab.processors.clu.CluProcessor +import org.clulab.processors.clu.BalaurProcessor import org.clulab.sequences.LexiconNER import org.clulab.utils.FileUtils @@ -27,7 +27,7 @@ object OdinStarter3: val baseDirOpt = if isLocal then Some(resourceDir) 
else None LexiconNER(kbs, caseInsensitiveMatchings, baseDirOpt) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) val extractorEngine = val masterResource = "/org/clulab/odinstarter/main.yml" // We usually want to reload rules during development, diff --git a/main/src/main/scala/org/clulab/embeddings/CompactWordEmbeddingMap.scala b/main/src/main/scala/org/clulab/embeddings/CompactWordEmbeddingMap.scala index ffe614cf8..bc4db7d75 100644 --- a/main/src/main/scala/org/clulab/embeddings/CompactWordEmbeddingMap.scala +++ b/main/src/main/scala/org/clulab/embeddings/CompactWordEmbeddingMap.scala @@ -8,7 +8,6 @@ import org.clulab.scala.WrappedArray._ import org.clulab.scala.WrappedArrayBuffer._ import org.clulab.utils.ArrayView import org.clulab.utils.ClassLoaderObjectInputStream -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Logging import org.clulab.utils.MutableArrayView import org.clulab.utils.Sourcer @@ -20,6 +19,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.{ArrayBuilder => MutableArrayBuilder} import scala.collection.mutable.{HashMap => MutableHashMap} import scala.io.Source +import scala.util.Using /** * This class and its companion object have been backported from Eidos. 
There it is/was an optional @@ -211,7 +211,7 @@ class CompactWordEmbeddingMap(protected val buildType: CompactWordEmbeddingMap.B map.toArray.sortBy(_._2).map(_._1).mkString("\n") def save(filename: String): Unit = { - new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename))).autoClose { objectOutputStream => + Using.resource(new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))) { objectOutputStream => objectOutputStream.writeObject(mkTextFromMap()) objectOutputStream.writeObject(array) objectOutputStream.writeObject(buildType.unknownArray.orNull) @@ -222,7 +222,7 @@ class CompactWordEmbeddingMap(protected val buildType: CompactWordEmbeddingMap.B def saveKryo(filename: String): Unit = { val kryo = CompactWordEmbeddingMap.newKryo() - new Output(new BufferedOutputStream(new FileOutputStream(filename))).autoClose { output => + Using.resource(new Output(new BufferedOutputStream(new FileOutputStream(filename)))) { output => kryo.writeObject(output, mkTextFromMap()) kryo.writeObject(output, array) kryo.writeObject(output, buildType.unknownArray.orNull) @@ -273,7 +273,7 @@ object CompactWordEmbeddingMap extends Logging { loadTxt(Source.fromInputStream(inputStream, StandardCharsets.ISO_8859_1.toString)) def loadTxt(source: Source): BuildType = { - source.autoClose { source => + Using.resource(source) { source => val lines = source.getLines() buildMatrix(lines) @@ -306,7 +306,7 @@ object CompactWordEmbeddingMap extends Logging { def loadSer(filename: String): BuildType = loadSer(new FileInputStream(filename)) def loadSer(inputStream: InputStream): BuildType = { - new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(inputStream)).autoClose { objectInputStream => + Using.resource(new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(inputStream))) { objectInputStream => val map = mkMapFromText(objectInputStream.readObject().asInstanceOf[String]) val array = 
objectInputStream.readObject().asInstanceOf[Array[Float]] val unknownArrayOpt = Option(objectInputStream.readObject().asInstanceOf[Array[Float]]) @@ -328,7 +328,7 @@ object CompactWordEmbeddingMap extends Logging { def loadKryo(inputStream: InputStream): BuildType = { val kryo = newKryo() - new Input(new BufferedInputStream(inputStream)).autoClose { input => + Using.resource(new Input(new BufferedInputStream(inputStream))) { input => val map = mkMapFromText(kryo.readObject(input, classOf[String])) val array = kryo.readObject(input, classOf[Array[Float]]) val unknownArrayOpt = Option(kryo.readObjectOrNull(input, classOf[Array[Float]])) diff --git a/main/src/main/scala/org/clulab/embeddings/CullVectors.scala b/main/src/main/scala/org/clulab/embeddings/CullVectors.scala index dfafaa893..f30fba840 100644 --- a/main/src/main/scala/org/clulab/embeddings/CullVectors.scala +++ b/main/src/main/scala/org/clulab/embeddings/CullVectors.scala @@ -1,11 +1,11 @@ package org.clulab.embeddings -import java.io.File - -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Sinker import org.clulab.utils.Sourcer +import java.io.File +import scala.util.Using + // Expect this to use lots of memory. object CullVectors extends App { // This should be something like glove.840B.300d.txt. @@ -84,7 +84,7 @@ object CullVectors extends App { ) // This is Map[word, (index, freq)]. The index is used for separating frequent from infrequent words. // The freq is used to eventually weight the vectors for each word when words are combined into single vectors. 
- val wordFrequencies: Map[String, (Int, Int)] = Sourcer.sourceFromFile(inFrequencyFile).autoClose { source => + val wordFrequencies: Map[String, (Int, Int)] = Using.resource(Sourcer.sourceFromFile(inFrequencyFile)) { source => val counter = Counter(-1) val frequentWords = source .getLines() @@ -99,7 +99,7 @@ object CullVectors extends App { frequentWords } - val (columns, badFloats, goodLines) = Sourcer.sourceFromFile(inVectorFile).autoClose { source => + val (columns, badFloats, goodLines) = Using.resource(Sourcer.sourceFromFile(inVectorFile)) { source => val bufferedLines = source.getLines().buffered val line = bufferedLines.head val columns = { @@ -135,7 +135,7 @@ object CullVectors extends App { val badLine = badStrings.mkString(" ", " ", "") // The \n is to force LF as eol even on Windows. - Sinker.printWriterFromFile(outputFile, append = false).autoClose { printWriter => + Using.resource(Sinker.printWriterFromFile(outputFile, append = false)) { printWriter => printWriter.print(count.toString + " " + columns) printWriter.print("\n") printWriter.print(badLine) diff --git a/main/src/main/scala/org/clulab/embeddings/ExplicitWordEmbeddingMap.scala b/main/src/main/scala/org/clulab/embeddings/ExplicitWordEmbeddingMap.scala index 50975efa4..d7d09a698 100644 --- a/main/src/main/scala/org/clulab/embeddings/ExplicitWordEmbeddingMap.scala +++ b/main/src/main/scala/org/clulab/embeddings/ExplicitWordEmbeddingMap.scala @@ -1,19 +1,37 @@ package org.clulab.embeddings -import java.io._ import org.clulab.scala.BufferedIterator import org.clulab.scala.WrappedArray._ import org.clulab.utils.ClassLoaderObjectInputStream -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Logging import org.clulab.utils.Sourcer import java.nio.charset.StandardCharsets +import java.io._ import scala.collection.mutable.{HashMap => MutableHashMap} import scala.io.Source +import scala.util.Using /** - * Implements an word embedding map, where each embedding is stored as a distinct 
array + * Implements a word embedding map where each embedding is stored as a distinct array. + * + * This class accommodates glove embedding files, either with or without the header line + * that has sometimes been inserted into files to indicate the number of rows and columns + * of vector values and with an optional vector for unknown words. An assortment of glove + * files packaged into jars is available from [[https://artifactory.clulab.org CLU Lab's Artifactory server]] + * and more can be downloaded in text format from the [[https://nlp.stanford.edu/projects/glove/ GloVe website]]. + * + * The jarred variants make it possible to include word embeddings as a library dependency + * and to read the files as resources. A resource flag is included in several methods for + * this eventuality. The original text files can be extracted manually from the jars if + * need be. Embeddings are read from the filesystem when resource = false, which is the + * default. Some CLU Lab glove files in circulation have an empty word (blank string) + * inserted, usually as the first word in the file. The associated vector can be used for + * unknown words in place of a zero or random vector and instead of leaving out words. + * The words in a glove file have (usually) had their case preserved, so for most accurate + * results, treat other words the same. + * + * A simple example is included in [[org.clulab.embeddings.ExplicitWordEmbeddingMap\$.main main]]. 
*/ class ExplicitWordEmbeddingMap(protected val buildType: ExplicitWordEmbeddingMap.BuildType) extends WordEmbeddingMap { val map: ExplicitWordEmbeddingMap.ImplMapType = buildType.map @@ -135,7 +153,7 @@ class ExplicitWordEmbeddingMap(protected val buildType: ExplicitWordEmbeddingMap } def save(filename: String): Unit = { - new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename))).autoClose { objectOutputStream => + Using.resource(new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))) { objectOutputStream => objectOutputStream.writeObject(map) objectOutputStream.writeObject(buildType.unknownArray) } @@ -179,10 +197,10 @@ object ExplicitWordEmbeddingMap extends Logging { } protected def loadTxt(filename: String, resource: Boolean): BuildType = { - ( + Using.resource( if (resource) Sourcer.sourceFromResource(filename, StandardCharsets.ISO_8859_1.toString) else Sourcer.sourceFromFilename(filename, StandardCharsets.ISO_8859_1.toString) - ).autoClose { source => + ) { source => val lines = source.getLines() buildMatrix(lines) @@ -190,7 +208,7 @@ object ExplicitWordEmbeddingMap extends Logging { } protected def loadBin(filename: String): BuildType = { - new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(new FileInputStream(filename))).autoClose { objectInputStream => + Using.resource(new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(new FileInputStream(filename)))) { objectInputStream => loadBin(objectInputStream) } } @@ -247,4 +265,18 @@ object ExplicitWordEmbeddingMap extends Logging { require(wordCountOpt.get == total, s"The matrix file should have had ${wordCountOpt.get} lines of words.") BuildType(map, unknownWeightsOpt) } + + def main(args: Array[String]): Unit = { + println("Syntax: ") + val filename = args.lift(0).getOrElse("glove.840B.300d.10f.txt") + val count = args.lift(1).getOrElse("10").toInt + val argsWords = args.slice(2, 
args.length).toSet + val words = if (argsWords.isEmpty) Set("house") else argsWords + val wordEmbeddingMap = ExplicitWordEmbeddingMap(filename, resource = false) + val mostSimilarWords = wordEmbeddingMap.mostSimilarWords(words, count) + + mostSimilarWords.zipWithIndex.foreach { case ((word, similarity), index) => + println(s"$index $word $similarity") + } + } } diff --git a/main/src/main/scala/org/clulab/embeddings/LemmatizeEmbeddings.scala b/main/src/main/scala/org/clulab/embeddings/LemmatizeEmbeddings.scala index 98adefb3d..add38c148 100644 --- a/main/src/main/scala/org/clulab/embeddings/LemmatizeEmbeddings.scala +++ b/main/src/main/scala/org/clulab/embeddings/LemmatizeEmbeddings.scala @@ -1,11 +1,11 @@ package org.clulab.embeddings -import java.io.PrintWriter - import org.clulab.processors.clu.tokenizer.EnglishLemmatizer import org.clulab.struct.Counter +import java.io.PrintWriter import scala.collection.mutable +import scala.util.Using /** * Generates embeddings for lemmas, by averaging GloVe embeddings for words that have the same lemma @@ -125,15 +125,15 @@ object LemmatizeEmbeddings { val le = new LemmatizeEmbeddings(freqFile, embedFile) val lemmaEmbeddings = le.lemmatize() - val pw = new PrintWriter(outputFile) - for(lemma <- lemmaEmbeddings.keySet) { - pw.print(lemma) - val v = lemmaEmbeddings(lemma) - for(i <- v.indices) { - pw.print(" " + v(i)) + Using.resource(new PrintWriter(outputFile)) { pw => + for (lemma <- lemmaEmbeddings.keySet) { + pw.print(lemma) + val v = lemmaEmbeddings(lemma) + for (i <- v.indices) { + pw.print(" " + v(i)) + } + pw.println() } - pw.println() } - pw.close() } } diff --git a/main/src/main/scala/org/clulab/embeddings/SanitizedWordEmbeddingMap.scala b/main/src/main/scala/org/clulab/embeddings/SanitizedWordEmbeddingMap.scala index f2d7e0eb2..ef45e6cb0 100644 --- a/main/src/main/scala/org/clulab/embeddings/SanitizedWordEmbeddingMap.scala +++ b/main/src/main/scala/org/clulab/embeddings/SanitizedWordEmbeddingMap.scala @@ -1,14 +1,14 
@@ package org.clulab.embeddings -import java.io._ -import java.nio.{ByteBuffer, ByteOrder} - import org.apache.commons.io.{FileUtils, IOUtils} import org.clulab.utils.MathUtils import org.slf4j.{Logger, LoggerFactory} +import java.io._ +import java.nio.{ByteBuffer, ByteOrder} import scala.collection.mutable.ArrayBuffer import scala.io.Source +import scala.util.Using /** * Implements similarity metrics using the embedding matrix @@ -43,13 +43,13 @@ class SanitizedWordEmbeddingMap(matrixConstructor: => Map[String, Array[Double]] val matrix : Map[String, Array[Double]] = matrixConstructor def saveMatrix(mf: String): Unit = { - val pw = new PrintWriter(mf) - pw.println(s"${matrix.size}, $dimensions") - for ((word, vec) <- matrix) { - val strRep = vec.map(_.formatted("%.6f")).mkString(" ") - pw.println(s"$word $strRep") + Using.resource(new PrintWriter(mf)) { pw => + pw.println(s"${matrix.size}, $dimensions") + for ((word, vec) <- matrix) { + val strRep = vec.map(_.formatted("%.6f")).mkString(" ") + pw.println(s"$word $strRep") + } } - pw.close() } /** If the word doesn't exist in the lexicon, try to use UNK */ @@ -413,24 +413,24 @@ object SanitizedWordEmbeddingMap { wordsToUse: Option[Set[String]], caseInsensitiveWordsToUse:Boolean):(Map[String, Array[Double]], Int) = { logger.debug("Started to load embedding matrix from file " + mf + "...") - val src: Source = Source.fromFile(mf, "iso-8859-1") - val lines: Iterator[String] = src.getLines() - val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) - src.close() - logger.debug("Completed matrix loading.") - matrix + Using.resource(Source.fromFile(mf, "iso-8859-1")) { src => + val lines: Iterator[String] = src.getLines() + val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) + logger.debug("Completed matrix loading.") + matrix + } } private def loadMatrixFromStream(is: InputStream, wordsToUse: Option[Set[String]], caseInsensitiveWordsToUse:Boolean):(Map[String, Array[Double]], Int) = { 
logger.debug("Started to load embedding matrix from stream ...") - val src: Source = Source.fromInputStream(is, "iso-8859-1") - val lines: Iterator[String] = src.getLines() - val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) - src.close() - logger.debug("Completed matrix loading.") - matrix + Using.resource(Source.fromInputStream(is, "iso-8859-1")) { src => + val lines: Iterator[String] = src.getLines() + val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) + logger.debug("Completed matrix loading.") + matrix + } } private def loadMatrixFromSource(src: Source, wordsToUse: Option[Set[String]], diff --git a/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMap.scala b/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMap.scala index 531610089..f5266d9ad 100644 --- a/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMap.scala +++ b/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMap.scala @@ -1,5 +1,8 @@ package org.clulab.embeddings +import org.clulab.scala.WrappedArray._ +import org.clulab.utils.MathUtils + import scala.collection.mutable.{IndexedSeq => MutableIndexedSeq} /** @@ -41,6 +44,30 @@ trait WordEmbeddingMap { /** Save this object in binary format. */ def save(filename: String): Unit + + /** filterPredicate: if passed, only returns words that match the predicate */ + def mostSimilarWords(vector: Array[Float], howMany: Int, filterPredicateOpt: Option[String => Boolean]): Seq[(String, Double)] = { + val unfilteredKeys = keys + val filteredKeys = filterPredicateOpt.map(unfilteredKeys.filter).getOrElse(unfilteredKeys) + val result = MathUtils.nBest[String](word => WordEmbeddingMap.dotProduct(vector, getOrElseUnknown(word)).toDouble)(filteredKeys, howMany) + + result + } + + /** + * Finds the words most similar to this set of inputs + * + * IMPORTANT: Words here must already be normalized to match how they are stored in the map. 
+ * + * This method is included to support the interface of the deprecated [[org.clulab.embeddings.SanitizedWordEmbeddingMap SanitizedWordEmbeddingMap]]. + * Unknown words may be skipped in calculating the composite or the unknown vector might be + * used. That is decided by the subclass. This method calls only public member functions, + * so reimplement or subclass for alternative behavior. + */ + def mostSimilarWords(words: Set[String], howMany: Int): Seq[(String, Double)] = { + val compositeVector = makeCompositeVector(words) + mostSimilarWords(compositeVector, howMany, None) + } } object WordEmbeddingMap { diff --git a/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala b/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala index 6124cda90..fafdbb484 100644 --- a/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala +++ b/main/src/main/scala/org/clulab/embeddings/WordEmbeddingMapPool.scala @@ -1,6 +1,5 @@ package org.clulab.embeddings -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.InputStreamer import org.clulab.utils.InputStreamer.StreamResult import org.clulab.utils.NamedFuture @@ -10,6 +9,8 @@ import scala.concurrent.Await import scala.concurrent.Future import scala.concurrent.duration.Duration +import scala.util.Using + /** Manages a pool of word embedding maps, so we do not load them more than once */ object WordEmbeddingMapPool { @@ -72,7 +73,7 @@ object WordEmbeddingMapPool { def loadEmbedding(name: String, fileLocation: String, resourceLocation: String, compact: Boolean): WordEmbeddingMap = { val StreamResult(inputStream, _, format) = inputStreamer.stream(name, fileLocation, resourceLocation) .getOrElse(throw new RuntimeException(s"WordEmbeddingMap $name could not be opened.")) - val wordEmbeddingMap = inputStream.autoClose { inputStream => + val wordEmbeddingMap = Using.resource(inputStream) { inputStream => val binary = format == InputStreamer.Format.Bin if (compact) 
CompactWordEmbeddingMap(inputStream, binary) diff --git a/main/src/main/scala/org/clulab/learning/Classifier.scala b/main/src/main/scala/org/clulab/learning/Classifier.scala index 43901a8cd..4aa135a65 100644 --- a/main/src/main/scala/org/clulab/learning/Classifier.scala +++ b/main/src/main/scala/org/clulab/learning/Classifier.scala @@ -1,10 +1,11 @@ package org.clulab.learning -import java.io._ - import org.clulab.struct.Counter import org.clulab.learning.Datasets._ +import java.io._ +import scala.util.Using + /** * Trait for iid classification * For reranking problems, see RankingClassifier @@ -38,9 +39,9 @@ trait Classifier[L, F] { /** Saves the current model to a file */ def saveTo(fileName:String): Unit = { - val bw = new BufferedWriter(new FileWriter(fileName)) - saveTo(bw) - bw.close() + Using.resource(new BufferedWriter(new FileWriter(fileName))) { bw => + saveTo(bw) + } } /** Saves to writer. Does NOT close the writer */ diff --git a/main/src/main/scala/org/clulab/learning/Dataset.scala b/main/src/main/scala/org/clulab/learning/Dataset.scala index 615b7808e..f6464f8ee 100644 --- a/main/src/main/scala/org/clulab/learning/Dataset.scala +++ b/main/src/main/scala/org/clulab/learning/Dataset.scala @@ -1,18 +1,19 @@ package org.clulab.learning -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, ListBuffer} import org.clulab.struct.Counter import org.clulab.struct.Lexicon - -import scala.io.{BufferedSource, Source} -import java.util.zip.GZIPInputStream -import java.io.{FileWriter, PrintWriter} -import org.slf4j.{Logger, LoggerFactory} -import RVFDataset._ import org.clulab.utils.Files +import org.slf4j.{Logger, LoggerFactory} +import java.io.PrintWriter +import java.util.zip.GZIPInputStream +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.io.{BufferedSource, Source} import scala.reflect.ClassTag +import scala.util.Using + +import RVFDataset._ /** * Parent class for classification 
datasets @@ -453,25 +454,25 @@ object RVFDataset { featureLexicon:Lexicon[String], fn:String): Unit = { - val os = new PrintWriter(new FileWriter(fn)) - for(datum <- datums) { - os.print(datum.label) - val fs = new ListBuffer[(Int, Double)] - val c = datum.featuresCounter - for(k <- c.keySet) { - val fi = featureLexicon.get(k) - if(fi.isDefined) { - // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") - fs += ((fi.get + 1, c.getCount(k))) + Using.resource(new PrintWriter(fn)) { os => + for (datum <- datums) { + os.print(datum.label) + val fs = new ListBuffer[(Int, Double)] + val c = datum.featuresCounter + for (k <- c.keySet) { + val fi = featureLexicon.get(k) + if (fi.isDefined) { + // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") + fs += ((fi.get + 1, c.getCount(k))) + } } + val fss = fs.toList.sortBy(_._1) + for (t <- fss) { + os.print(s" ${t._1}:${t._2}") + } + os.println() } - val fss = fs.toList.sortBy(_._1) - for(t <- fss) { - os.print(s" ${t._1}:${t._2}") - } - os.println() } - os.close() } def mkDatumsFromSvmLightResource(path: String): Iterable[Datum[Int, String]] = { diff --git a/main/src/main/scala/org/clulab/learning/Datasets.scala b/main/src/main/scala/org/clulab/learning/Datasets.scala index 1aa7d0425..21516a8b0 100644 --- a/main/src/main/scala/org/clulab/learning/Datasets.scala +++ b/main/src/main/scala/org/clulab/learning/Datasets.scala @@ -32,11 +32,11 @@ object Datasets { val trainFolds = new ArrayBuffer[(Int, Int)] if(startTest > 0) - trainFolds += Tuple2(0, startTest) + trainFolds += ((0, startTest)) if(endTest < size) - trainFolds += Tuple2(endTest, size) + trainFolds += ((endTest, size)) - folds += new DatasetFold(Tuple2(startTest, endTest), trainFolds.toList) + folds += new DatasetFold((startTest, endTest), trainFolds.toList) } folds.toList } @@ -54,7 +54,7 @@ object Datasets { private def mkFullFold(size:Int): Iterable[(Int, Int)] = { val folds = new Array[(Int, Int)](1) - folds(0) = Tuple2(0, size) + folds(0) 
= (0, size) folds } @@ -344,7 +344,7 @@ object Datasets { for(i <- fold.testFold._1 until fold.testFold._2) { val sys = classifier.classOf(dataset.mkDatum(i)) val gold = dataset.labels(i) - output += Tuple2(dataset.labelLexicon.get(gold), sys) + output += ((dataset.labelLexicon.get(gold), sys)) } } diff --git a/main/src/main/scala/org/clulab/learning/LibLinearClassifier.scala b/main/src/main/scala/org/clulab/learning/LibLinearClassifier.scala index 2b28653f4..8350326ff 100644 --- a/main/src/main/scala/org/clulab/learning/LibLinearClassifier.scala +++ b/main/src/main/scala/org/clulab/learning/LibLinearClassifier.scala @@ -1,14 +1,17 @@ package org.clulab.learning -import org.clulab.utils.{Files,MathUtils} -import org.slf4j.LoggerFactory import de.bwaldvogel.liblinear._ import org.clulab.struct.Counter import org.clulab.struct.Lexicon +import org.clulab.utils.{Files,MathUtils} +import org.slf4j.LoggerFactory + +import java.io._ +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import scala.util.Using + import LiblinearClassifier.logger -import scala.collection.mutable -import java.io._ /** * Wrapper for liblinear classifiers, which includes LR and linear SVM @@ -324,10 +327,10 @@ object LiblinearClassifier { val logger = LoggerFactory.getLogger(classOf[LiblinearClassifier[String, String]]) def loadFrom[L, F](fileName:String):LiblinearClassifier[L, F] = { - val r = new BufferedReader(new FileReader(fileName)) - val c = loadFrom[L, F](r) - r.close() - c + Using.resource(new BufferedReader(new FileReader(fileName))) { r => + val c = loadFrom[L, F](r) + c + } } def loadFrom[L, F](r:Reader):LiblinearClassifier[L, F] = { diff --git a/main/src/main/scala/org/clulab/learning/LibLinearRegression.scala b/main/src/main/scala/org/clulab/learning/LibLinearRegression.scala index 758a6f518..47218621d 100644 --- a/main/src/main/scala/org/clulab/learning/LibLinearRegression.scala +++ b/main/src/main/scala/org/clulab/learning/LibLinearRegression.scala @@ 
-1,13 +1,16 @@ package org.clulab.learning +import de.bwaldvogel.liblinear._ import org.clulab.utils.Files import org.slf4j.LoggerFactory -import de.bwaldvogel.liblinear._ + +import java.io._ import org.clulab.struct.Counter import org.clulab.struct.Lexicon import scala.collection.mutable.ArrayBuffer +import scala.util.Using + import LiblinearRegression.logger -import java.io._ /** * Wrapper for liblinear regression, including LR and linear SVM @@ -253,10 +256,10 @@ object LiblinearRegression { val logger = LoggerFactory.getLogger(this.getClass) def loadFrom[F](fileName:String):LiblinearRegression[F] = { - val r = new BufferedReader(new FileReader(fileName)) - val c = loadFrom[F](r) - r.close() - c + Using.resource(new BufferedReader(new FileReader(fileName))) { r => + val c = loadFrom[F](r) + c + } } def loadFrom[F](r:Reader): LiblinearRegression[F] = { diff --git a/main/src/main/scala/org/clulab/learning/PerceptronClassifier.scala b/main/src/main/scala/org/clulab/learning/PerceptronClassifier.scala index b8934259d..a24b938c4 100644 --- a/main/src/main/scala/org/clulab/learning/PerceptronClassifier.scala +++ b/main/src/main/scala/org/clulab/learning/PerceptronClassifier.scala @@ -1,16 +1,19 @@ package org.clulab.learning import org.clulab.struct.Counter -import java.io._ -import org.slf4j.LoggerFactory -import java.util.Properties import org.clulab.utils.{Files, MathUtils, StringUtils} import org.clulab.struct.Lexicon import org.clulab.struct.Counters._ -import PerceptronClassifier.logger -import scala.collection.mutable.ArrayBuffer +import org.slf4j.LoggerFactory + +import java.io._ +import java.util.Properties import scala.Serializable +import scala.collection.mutable.ArrayBuffer import scala.util.Random +import scala.util.Using + +import PerceptronClassifier.logger /** * Multiclass perceptron classifier, in primal mode @@ -101,7 +104,7 @@ class PerceptronClassifier[L, F] ( // compute the scores for all class labels val predictions = new ArrayBuffer[(Int, 
Double)](labelLexicon.size) for(i <- 0 until labelLexicon.size) { - predictions += new Tuple2(i, dotProduct(weights(i), datum)) + predictions += ((i, dotProduct(weights(i), datum))) } // sort predictions in descending order of scores @@ -268,10 +271,10 @@ object PerceptronClassifier { val logger = LoggerFactory.getLogger(classOf[PerceptronClassifier[String, String]]) def loadFrom[L, F](fileName:String):PerceptronClassifier[L, F] = { - val r = new BufferedReader(new FileReader(fileName)) - val c = loadFrom[L, F](r) - r.close() - c + Using.resource(new BufferedReader(new FileReader(fileName))) { r => + val c = loadFrom[L, F](r) + c + } } def loadFrom[L, F](r:Reader):PerceptronClassifier[L, F] = { diff --git a/main/src/main/scala/org/clulab/learning/RFClassifier.scala b/main/src/main/scala/org/clulab/learning/RFClassifier.scala index ba67950a8..60b469115 100644 --- a/main/src/main/scala/org/clulab/learning/RFClassifier.scala +++ b/main/src/main/scala/org/clulab/learning/RFClassifier.scala @@ -302,7 +302,7 @@ class RFClassifier[L, F](numTrees:Int = 100, for(f <- features) { contingencyTables(f) = new Array[(Counter[Int], Counter[Int])](job.featureThresholds(f).length) for(i <- contingencyTables(f).indices) { - contingencyTables(f)(i) = new Tuple2(new Counter[Int], new Counter[Int]) + contingencyTables(f)(i) = (new Counter[Int], new Counter[Int]) } } @@ -417,7 +417,7 @@ class RFClassifier[L, F](numTrees:Int = 100, val newActiveNodes = new mutable.HashSet[(Int, Double)]() newActiveNodes ++= job.activeNodes - newActiveNodes += new Tuple2(best.get.feature, best.get.threshold) + newActiveNodes += ((best.get.feature, best.get.threshold)) val newActiveNodesSet = newActiveNodes.toSet new RFNonTerminal(best.get.feature, best.get.threshold, buildTree(mkLeftJob(job, best.get.feature, best.get.threshold, best.get.leftChildValue, newActiveNodesSet)), @@ -724,7 +724,7 @@ class RFJob[L, F]( val labels = new ArrayBuffer[(Int, Int)] // gold, pred for(i <- oobIndices.indices) { val 
prediction = tree.apply(dataset.featuresCounter(oobIndices(i))).sorted.head._1 - labels += new Tuple2(dataset.labels(oobIndices(i)), prediction) + labels += ((dataset.labels(oobIndices(i)), prediction)) } if(nilLabel.isEmpty) accuracy(labels) diff --git a/main/src/main/scala/org/clulab/learning/RankingDataset.scala b/main/src/main/scala/org/clulab/learning/RankingDataset.scala index 2ae83025c..0e7e9b154 100644 --- a/main/src/main/scala/org/clulab/learning/RankingDataset.scala +++ b/main/src/main/scala/org/clulab/learning/RankingDataset.scala @@ -1,16 +1,16 @@ package org.clulab.learning -import java.util.zip.GZIPInputStream -import java.io.{BufferedInputStream, FileInputStream, FileOutputStream, FileWriter, ObjectInputStream, ObjectOutputStream, PrintWriter} - -import org.slf4j.LoggerFactory - -import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.io.{BufferedSource, Source} import org.clulab.struct.Counter import org.clulab.struct.Lexicon import org.clulab.utils.Files import org.clulab.utils.Serializer +import org.slf4j.LoggerFactory + +import java.io.{BufferedInputStream, FileInputStream, FileOutputStream, FileWriter, ObjectInputStream, ObjectOutputStream, PrintWriter} +import java.util.zip.GZIPInputStream +import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.io.{BufferedSource, Source} +import scala.util.Using /** * Parent class for all datasets used for ranking problems @@ -63,7 +63,7 @@ class BVFRankingDataset[F] extends RankingDataset[F] { for(d <- queryDatums) { d match { case bd:BVFDatum[Int, F] => { - b += new Tuple2[Int, Array[Int]](bd.label, featuresToArray(bd.features)) + b += ((bd.label, featuresToArray(bd.features))) } case _ => throw new RuntimeException("ERROR: you cannot add a non BVFDatum to a BVFRankingDataset!") } @@ -155,10 +155,10 @@ class RVFRankingDataset[F] extends BVFRankingDataset[F] with FeatureTraversable[ d match { case rd:RVFDatum[Int, F] => { val fvs = featuresCounterToArray(d.featuresCounter) - b 
+= new Tuple3[Int, Array[Int], Array[Double]]( + b += (( rd.label, fvs.map(fv => fv._1), - fvs.map(fv => fv._2)) + fvs.map(fv => fv._2))) } case _ => throw new RuntimeException("ERROR: you cannot add a non RVFDatum to a RVFRankingDataset!") } @@ -169,7 +169,7 @@ class RVFRankingDataset[F] extends BVFRankingDataset[F] with FeatureTraversable[ protected def featuresCounterToArray(fs:Counter[F]):Array[(Int, Double)] = { val fb = new ListBuffer[(Int, Double)] for(f <- fs.keySet) { - fb += new Tuple2[Int, Double](featureLexicon.add(f), fs.getCount(f)) + fb += ((featureLexicon.add(f), fs.getCount(f))) } fb.sortBy(_._1).toArray } @@ -451,29 +451,29 @@ object RVFRankingDataset { featureLexicon:Lexicon[String], fn:String): Unit = { var qid = 0 - val os = new PrintWriter(new FileWriter(fn)) - for(query <- queries) { - qid += 1 - for(datum <- query) { - os.print(datum.label) - os.print(s" qid:$qid") - val fs = new ListBuffer[(Int, Double)] - val c = datum.featuresCounter - for(k <- c.keySet) { - val fi = featureLexicon.get(k) - if(fi.isDefined) { - // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") - fs += new Tuple2(fi.get + 1, c.getCount(k)) + Using.resource(new PrintWriter(fn)) { os => + for (query <- queries) { + qid += 1 + for (datum <- query) { + os.print(datum.label) + os.print(s" qid:$qid") + val fs = new ListBuffer[(Int, Double)] + val c = datum.featuresCounter + for (k <- c.keySet) { + val fi = featureLexicon.get(k) + if (fi.isDefined) { + // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") + fs += ((fi.get + 1, c.getCount(k))) + } } + val fss = fs.toList.sortBy(_._1) + for (t <- fss) { + os.print(s" ${t._1}:${t._2}") + } + os.println() } - val fss = fs.toList.sortBy(_._1) - for(t <- fss) { - os.print(s" ${t._1}:${t._2}") - } - os.println() } } - os.close() } def loadFrom[F](fileName:String):RVFRankingDataset[F] = { @@ -499,11 +499,11 @@ class RVFKRankingDataset[F] extends RVFRankingDataset[F] { d match { case rd:RVFKDatum[Int, F] => { 
val fvs = featuresCounterToArray(d.featuresCounter) - b += new Tuple4[Int, Array[Int], Array[Double], String]( + b += (( rd.label, fvs.map(fv => fv._1), fvs.map(fv => fv._2), - rd.kernel) + rd.kernel)) } case _ => throw new RuntimeException("ERROR: you cannot add a non RVFKDatum to a RVFKRankingDataset!") } diff --git a/main/src/main/scala/org/clulab/learning/RegDataset.scala b/main/src/main/scala/org/clulab/learning/RegDataset.scala index 07dfea217..85e1f6465 100644 --- a/main/src/main/scala/org/clulab/learning/RegDataset.scala +++ b/main/src/main/scala/org/clulab/learning/RegDataset.scala @@ -1,19 +1,19 @@ package org.clulab.learning -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, ListBuffer} import org.clulab.struct.Counter import org.clulab.struct.Lexicon +import org.clulab.utils.Files +import org.slf4j.LoggerFactory -import scala.io.{BufferedSource, Source} +import java.io.PrintWriter import java.util.zip.GZIPInputStream -import java.io.{BufferedInputStream, FileInputStream, FileWriter, PrintWriter} +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.io.{BufferedSource, Source} +import scala.reflect.ClassTag +import scala.util.Using -import org.slf4j.LoggerFactory import RVFRegDataset._ -import org.clulab.utils.Files - -import scala.reflect.ClassTag /** * Parent class for regression datasets. For classification, see [[Dataset]]. 
@@ -122,7 +122,7 @@ class BVFRegDataset[F: ClassTag] ( // sort all features in descending order of their IG val fb = new ListBuffer[(Int, Double)] - for(f <- igs.keySet) fb += new Tuple2(f, igs.get(f).get.ig(total)) + for(f <- igs.keySet) fb += ((f, igs.get(f).get.ig(total))) val sortedFeats = fb.sortBy(- _._2).toArray // keep the top pctToKeep @@ -245,7 +245,7 @@ class RVFRegDataset[F: ClassTag] ( private def featuresCounterToArray(fs:Counter[F]):Array[(Int, Double)] = { val fb = new ListBuffer[(Int, Double)] for(f <- fs.keySet) { - fb += new Tuple2[Int, Double](featureLexicon.add(f), fs.getCount(f)) + fb += ((featureLexicon.add(f), fs.getCount(f))) } fb.sortBy(_._1).toArray } @@ -450,25 +450,25 @@ object RVFRegDataset { featureLexicon:Lexicon[String], fn:String): Unit = { - val os = new PrintWriter(new FileWriter(fn)) - for(datum <- datums) { - os.print(datum.label) - val fs = new ListBuffer[(Int, Double)] - val c = datum.featuresCounter - for(k <- c.keySet) { - val fi = featureLexicon.get(k) - if(fi.isDefined) { - // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") - fs += new Tuple2(fi.get + 1, c.getCount(k)) + Using.resource(new PrintWriter(fn)) { os => + for (datum <- datums) { + os.print(datum.label) + val fs = new ListBuffer[(Int, Double)] + val c = datum.featuresCounter + for (k <- c.keySet) { + val fi = featureLexicon.get(k) + if (fi.isDefined) { + // logger.debug(s"Feature [$k] converted to index ${fi.get + 1}") + fs += ((fi.get + 1, c.getCount(k))) + } } + val fss = fs.toList.sortBy(_._1) + for (t <- fss) { + os.print(s" ${t._1}:${t._2}") + } + os.println() } - val fss = fs.toList.sortBy(_._1) - for(t <- fss) { - os.print(s" ${t._1}:${t._2}") - } - os.println() } - os.close() } def mkDatumsFromSvmLightResource(path: String): Iterable[Datum[Double, String]] = { diff --git a/main/src/main/scala/org/clulab/learning/Regression.scala b/main/src/main/scala/org/clulab/learning/Regression.scala index 348743972..dc46f1a73 100644 --- 
a/main/src/main/scala/org/clulab/learning/Regression.scala +++ b/main/src/main/scala/org/clulab/learning/Regression.scala @@ -1,9 +1,10 @@ package org.clulab.learning -import java.io._ - import org.clulab.learning.Datasets._ +import java.io._ +import scala.util.Using + /** * Trait for regression * Adapted from Classifier trait @@ -38,9 +39,9 @@ trait Regression[F] { /** Saves the current model to a file */ def saveTo(fileName:String): Unit = { - val bw = new BufferedWriter(new FileWriter(fileName)) - saveTo(bw) - bw.close() + Using.resource(new BufferedWriter(new FileWriter(fileName))) { bw => + saveTo(bw) + } } /** Saves to writer. Does NOT close the writer */ diff --git a/main/src/main/scala/org/clulab/learning/SVMRankingClassifier.scala b/main/src/main/scala/org/clulab/learning/SVMRankingClassifier.scala index 0470cbdc7..8300d1c43 100644 --- a/main/src/main/scala/org/clulab/learning/SVMRankingClassifier.scala +++ b/main/src/main/scala/org/clulab/learning/SVMRankingClassifier.scala @@ -1,18 +1,19 @@ package org.clulab.learning -import java.io._ -import java.util.Properties - +import org.clulab.struct.{Counter, Counters, Lexicon} +import org.clulab.utils.Serializer +import org.clulab.utils.StringUtils import org.slf4j.LoggerFactory +import java.io._ +import java.util.Properties import scala.Serializable import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.sys.process._ -import org.clulab.struct.{Counter, Counters, Lexicon} -import org.clulab.utils.StringUtils +import scala.util.Using + import SVMRankingClassifier.logger -import org.clulab.utils.Serializer /** * Wrapper for SVMrank: trains using svm_rank_learn but predicts using native Scala code @@ -51,9 +52,9 @@ class SVMRankingClassifier[F] ( def train(dataset:RankingDataset[F], spans:Option[Iterable[(Int, Int)]] = None): Unit = { val trainPath = workingDir + File.separator + trainFile - val trainWriter = new PrintWriter(trainPath) - val n = mkTrainFile(trainWriter, dataset, spans) - 
trainWriter.close() + val n = Using.resource(new PrintWriter(trainPath)) { trainWriter => + mkTrainFile(trainWriter, dataset, spans) + } logger.debug("Created training file: " + trainPath) val cRank = cLight * n @@ -230,7 +231,7 @@ class SVMRankingClassifier[F] ( private def mkFullFold(size:Int): Iterable[(Int, Int)] = { val folds = new Array[(Int, Int)](1) - folds(0) = new Tuple2(0, size) + folds(0) = (0, size) folds } @@ -338,38 +339,40 @@ class SVMRankingClassifier[F] ( if (debugFile.nonEmpty) { var features = new ArrayBuffer[(String, Int, Double)] - val pw = new PrintWriter(debugFile) - for(f <- featureLexicon.get.keySet) { - val idx = featureLexicon.get.get(f) - idx match { - case Some(x) => if (x < weights.get.size) { features.append ( (f.toString, featureLexicon.get.get(f).getOrElse(-1), weights.get(x)) ) } - case _ => + Using.resource(new PrintWriter(debugFile)) { pw => + for (f <- featureLexicon.get.keySet) { + val idx = featureLexicon.get.get(f) + idx match { + case Some(x) => if (x < weights.get.size) { + features.append((f.toString, featureLexicon.get.get(f).getOrElse(-1), weights.get(x))) + } + case _ => + } } - } - // Sort features - features = features.sortBy(- _._3) + // Sort features + features = features.sortBy(-_._3) - // Output features - for (i <- 0 until features.size) { - val feature = features(i) - var featureString = feature._1 - for (j <- 0 until (20 - featureString.size)) featureString += " " // Make featureString a constant length for formatting - pw.println (featureString + " \t weight: " + feature._3) - } + // Output features + for (i <- 0 until features.size) { + val feature = features(i) + var featureString = feature._1 + for (j <- 0 until (20 - featureString.size)) featureString += " " // Make featureString a constant length for formatting + pw.println(featureString + " \t weight: " + feature._3) + } - pw.println ("") - pw.println("Weights:") - var first = true - for(i <- 0 until weights.get.size) { - if(weights.get(i) != 0.0) { - 
if(! first) pw.print(" ") - pw.print(s"$i:${weights.get(i)}") - first = false + pw.println("") + pw.println("Weights:") + var first = true + for (i <- 0 until weights.get.size) { + if (weights.get(i) != 0.0) { + if (!first) pw.print(" ") + pw.print(s"$i:${weights.get(i)}") + first = false + } } + pw.println() } - pw.println() - pw.close() } } } diff --git a/main/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala b/main/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala index 9dfcf8df2..6d8b3209c 100644 --- a/main/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala +++ b/main/src/main/scala/org/clulab/numeric/EvalTimeNorm.scala @@ -3,10 +3,10 @@ package org.clulab.numeric import org.clulab.numeric.mentions.Norm import org.clulab.processors.Processor import org.clulab.processors.clu.BalaurProcessor -import org.clulab.utils.Closer.AutoCloser import java.nio.charset.StandardCharsets import scala.io.Source +import scala.util.Using object EvalTimeNorm { @@ -29,7 +29,7 @@ object EvalTimeNorm { val gold = goldTimex(docId).toSet val resource = s"$timeNormEvalDir/$docId/$docId" val docStream = getClass.getResourceAsStream(resource) - val docText = Source.fromInputStream(docStream)(StandardCharsets.UTF_8).autoClose { source => + val docText = Using.resource(Source.fromInputStream(docStream)(StandardCharsets.UTF_8)) { source => // This ensures that line endings are LF. FileUtils.getTextFromResource() will not. 
source.getLines().mkString("\n") } diff --git a/main/src/main/scala/org/clulab/numeric/ModifierNormalizer.scala b/main/src/main/scala/org/clulab/numeric/ModifierNormalizer.scala index 8ae195852..4736c5791 100644 --- a/main/src/main/scala/org/clulab/numeric/ModifierNormalizer.scala +++ b/main/src/main/scala/org/clulab/numeric/ModifierNormalizer.scala @@ -2,13 +2,6 @@ package org.clulab.numeric import java.time.{Month, YearMonth} -import org.clulab.sequences.CommentedStandardKbSource -import org.clulab.utils.Closer.AutoCloser -import org.clulab.utils.Sourcer - -import scala.collection.mutable -import scala.io.Source - object ModifierNormalizer { val APPROX_SYMBOL = "[APPROX]" diff --git a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala index 8f5ddb5b7..3d5976a7d 100644 --- a/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala +++ b/main/src/main/scala/org/clulab/numeric/NumericEntityRecognizer.scala @@ -30,6 +30,7 @@ class NumericEntityRecognizer protected (val lexiconNer: LexiconNER, val actions // this needs to happen in place, otherwise Odin does not see these labels // we will restore the original Sentence.entities at the end in `extractFrom` sent.entities = Some(labels) + // println(s"ENTITIES: ${sent.entities.get.mkString(" ")}") } originalEntities @@ -65,6 +66,7 @@ object NumericEntityRecognizer { // For the sake of SeasonNormalizer, this does have a leading /. 
val seasonPath = "/org/clulab/numeric/SEASON.tsv" val unitNormalizerPath = "/org/clulab/numeric/MEASUREMENT-UNIT.tsv" + val weekPath = "/org/clulab/numeric/WEEK.tsv" // this matches essential dictionaries such as month names def mkLexiconNer(seasonsPath: String): LexiconNER = { @@ -101,11 +103,12 @@ object NumericEntityRecognizer { ExtractorEngine(rules, actions, actions.cleanupAction, ruleDir = Some(ruleDir)) } - def apply(seasonPath: String = seasonPath, unitNormalizerPath: String = unitNormalizerPath): NumericEntityRecognizer = { + def apply(seasonPath: String = seasonPath, unitNormalizerPath: String = unitNormalizerPath, weekPath: String = weekPath): NumericEntityRecognizer = { val lexiconNer = mkLexiconNer(seasonPath) val seasonNormalizer = new SeasonNormalizer(seasonPath) val unitNormalizer = new UnitNormalizer(unitNormalizerPath) - val numericActions = new NumericActions(seasonNormalizer, unitNormalizer) + val weekNormalizer = new WeekNormalizer(weekPath) + val numericActions = new NumericActions(seasonNormalizer, unitNormalizer, weekNormalizer) val extractorEngine = mkExtractor(numericActions) new NumericEntityRecognizer(lexiconNer, numericActions, extractorEngine) diff --git a/main/src/main/scala/org/clulab/numeric/SeasonNormalizer.scala b/main/src/main/scala/org/clulab/numeric/SeasonNormalizer.scala index dbb8dc2ed..c0cbf0ffc 100644 --- a/main/src/main/scala/org/clulab/numeric/SeasonNormalizer.scala +++ b/main/src/main/scala/org/clulab/numeric/SeasonNormalizer.scala @@ -3,11 +3,11 @@ package org.clulab.numeric import java.io.File import org.clulab.sequences.CommentedStandardKbSource -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Sourcer import scala.collection.mutable import scala.io.Source +import scala.util.Using class SeasonNormalizer(seasonsPath: String) { val normMapper = SeasonNormalizer.readNormsFromResource(seasonsPath) @@ -16,6 +16,7 @@ class SeasonNormalizer(seasonsPath: String) { def adjustYearRange(seasonRange: SeasonRange, 
year: Seq[String]): (Seq[String], Seq[String]) = { val startMonthValue = seasonRange.startMonth.head.mkString(" ").toInt val endMonthValue = seasonRange.endMonth.head.mkString(" ").toInt + //println(s"startMonth = $startMonthValue; endMonth = $endMonthValue; year = ${year.mkString}") endMonthValue < startMonthValue match { case true if 12 - startMonthValue >= endMonthValue => val yearEnd = year.mkString.toInt + 1 @@ -40,9 +41,9 @@ object SeasonNormalizer { val customResourcePath = new File(NumericEntityRecognizer.resourceDir, path) if (customResourcePath.exists) - Sourcer.sourceFromFile(customResourcePath).autoClose(readNormsFromSource) + Using.resource(Sourcer.sourceFromFile(customResourcePath))(readNormsFromSource) else - Sourcer.sourceFromResource(path).autoClose(readNormsFromSource) + Using.resource(Sourcer.sourceFromResource(path))(readNormsFromSource) } def readNormsFromSource(source: Source): Map[String, SeasonRange] = { diff --git a/main/src/main/scala/org/clulab/numeric/TempEvalFormatter.scala b/main/src/main/scala/org/clulab/numeric/TempEvalFormatter.scala index f4c6d34c7..25e48b2cd 100644 --- a/main/src/main/scala/org/clulab/numeric/TempEvalFormatter.scala +++ b/main/src/main/scala/org/clulab/numeric/TempEvalFormatter.scala @@ -51,7 +51,7 @@ object TempEvalFormatter { } } - private def convertLiteralMonth(s: String): Int = { + def convertLiteralMonth(s: String): Int = { val v = s.toLowerCase() if(v.startsWith("jan")) 1 diff --git a/main/src/main/scala/org/clulab/numeric/UnitNormalizer.scala b/main/src/main/scala/org/clulab/numeric/UnitNormalizer.scala index d891fa057..05ea12710 100644 --- a/main/src/main/scala/org/clulab/numeric/UnitNormalizer.scala +++ b/main/src/main/scala/org/clulab/numeric/UnitNormalizer.scala @@ -1,11 +1,11 @@ package org.clulab.numeric import org.clulab.sequences.CommentedStandardKbSource -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Sourcer import scala.collection.mutable import scala.io.Source +import 
scala.util.Using case class NormAndUnitClass(norm: String, unitClassOpt: Option[String]) @@ -43,7 +43,7 @@ object UnitNormalizer { private val normMapper = readNormsFromResource("/org/clulab/numeric/MEASUREMENT-UNIT.tsv") def readNormsFromResource(path: String): Map[String, NormAndUnitClass] = - Sourcer.sourceFromResource(path).autoClose(readNormsFromSource) + Using.resource(Sourcer.sourceFromResource(path))(readNormsFromSource) def readNormsFromSource(source: Source): Map[String, NormAndUnitClass] = { val norms = new mutable.HashMap[String, NormAndUnitClass]() diff --git a/main/src/main/scala/org/clulab/numeric/WeekNormalizer.scala b/main/src/main/scala/org/clulab/numeric/WeekNormalizer.scala new file mode 100644 index 000000000..c9ba71218 --- /dev/null +++ b/main/src/main/scala/org/clulab/numeric/WeekNormalizer.scala @@ -0,0 +1,62 @@ +package org.clulab.numeric + +import java.io.File +import java.time.{Month, YearMonth} + +import org.clulab.sequences.CommentedStandardKbSource +import org.clulab.utils.Sourcer + +import scala.collection.mutable +import scala.io.Source +import scala.util.Using + +class WeekNormalizer(weekPath: String) { + val normMapper = WeekNormalizer.readNormsFromResource(weekPath) + + /** Normalizes seasons */ + def norm(text: Seq[String]): Option[WeekRange] = { + val week = text.mkString(" ").toLowerCase() + + normMapper.get(week) + } +} + +object WeekNormalizer { + + def readNormsFromResource(path: String): Map[String, WeekRange] = { + val customResourcePath = new File(NumericEntityRecognizer.resourceDir, path) + + if (customResourcePath.exists) + Using.resource(Sourcer.sourceFromFile(customResourcePath))(readNormsFromSource) + else + Using.resource(Sourcer.sourceFromResource(path))(readNormsFromSource) + } + + def readNormsFromSource(source: Source): Map[String, WeekRange] = { + val norms = new mutable.HashMap[String, WeekRange]() + + CommentedStandardKbSource.read(source) { (week, normOpt, unitClassOpt) => + assert(normOpt.isDefined) // 
We're insisting on this. + + val norm = normOpt.get.split("--").map(_.trim) + val (start, end) = norm match { + case Array(start, end) => (start, end) + case _ => throw new RuntimeException(s"ERROR: incorrect date range in week file") + } + val startDay = getDay(start) + val endDay = getDay(end) + norms += week -> WeekRange(startDay, endDay) + } + norms.toMap + } + + private def getDay(date: String): Option[Seq[String]] = { + date.split("-") match { + case Array(_, _, day) => Some(Seq(day)) + case _ => throw new RuntimeException(s"ERROR: incorrect date value in week file: $date") + } + } +} + +case class WeekRange(startDay: Option[Seq[String]], + endDay: Option[Seq[String]]) \ No newline at end of file diff --git a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala index b5f5d7142..a28a7e11e 100644 --- a/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala +++ b/main/src/main/scala/org/clulab/numeric/actions/NumericActions.scala @@ -1,13 +1,14 @@ package org.clulab.numeric.actions -import org.clulab.numeric.{SeasonNormalizer, UnitNormalizer} +import org.clulab.numeric.{SeasonNormalizer, UnitNormalizer, WeekNormalizer} import org.clulab.odin.{Actions, Mention, State} import org.clulab.numeric.mentions._ import org.clulab.scala.WrappedArrayBuffer._ +import java.util.regex.Pattern import scala.collection.mutable.ArrayBuffer -class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNormalizer) extends Actions { +class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNormalizer, weekNormalizer: WeekNormalizer) extends Actions { // // local actions // @@ -98,6 +99,11 @@ class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNor convert(mentions, toDateRangeMentionWithUntilRef, "toDateRangeMentionWithUntilRef") } + /** Constructs a DateRangeMention from a token pattern */ + def 
mkDateRangeMentionWithWeek(mentions: Seq[Mention], state: State): Seq[Mention] = { + convert(mentions, toDateRangeMentionWithWeek(weekNormalizer), "toDateRangeMentionWithWeek") + } + /** Constructs a DateRangeMention from a token pattern */ def mkDateUnboundRangeMentionBefore(mentions: Seq[Mention], state: State): Seq[Mention] = { convert(mentions, toDateUnboundRangeMentionBefore, "toDateUnboundRangeMentionBefore") @@ -225,16 +231,48 @@ class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNor } } - val r1 = keepLongestMentions(mentions) + val r1 = postprocessNumericEntities(mentions) + val r2 = keepLongestMentions(r1) if(false) { println("mentions after cleanup:") - for (m <- r1) { + for (m <- r2) { println("\t" + m.text) } println() } - r1 + r2 + } + + /** filter out season homonyms (fall, spring) **/ + def postprocessNumericEntities(mentions: Seq[Mention]): Seq[Mention] = { + + def prevWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + val prevWords = words.slice(wordIndex - 2, wordIndex).map(_.toLowerCase) + + prevWords.exists(NumericActions.preSeasons) || + prevWords.containsSlice(NumericActions.inThe) + } + + def contextWordsMatch(words: Array[String], wordIndex: Int): Boolean = { + val window = 5 + val contextWords = words.slice(wordIndex - window, wordIndex + window).map(_.toLowerCase) + + contextWords.exists(NumericActions.seasons) || + contextWords.exists(NumericActions.yearPattern.matcher(_).matches) + } + + val (seasonMentions, otherMentions) = mentions.partition(m => m.foundBy.contains("season")) + val (springFall, otherSeasons) = seasonMentions.partition(m => m.text.equalsIgnoreCase("spring") || m.text.equalsIgnoreCase("fall")) + val trueSeasons = springFall.filter { m => + m.tags.get.head.startsWith("NN") && { + val words = m.sentenceObj.words + val wordIndex = m.tokenInterval.start + + prevWordsMatch(words, wordIndex) || contextWordsMatch(words, wordIndex) + } + } + trueSeasons ++ otherSeasons ++ otherMentions } 
/** Keeps a date (or date range) mention only if it is not contained in another */ @@ -253,6 +291,14 @@ class NumericActions(seasonNormalizer: SeasonNormalizer, unitNormalizer: UnitNor } object NumericActions { + val seasons: Set[String] = Set("spring", "summer", "fall", "autumn", "winter") + // Words that typically precede a season that might distinguish it from a similar verb + val preSeasons: Set[String] = Set("this", "last", "every") + // A common introduction to a season + val inThe: Array[String] = Array("in", "the") + // Match a 1 to 4 digit year + val yearPattern = Pattern.compile("[0-9]{2}|[0-9]{4}") + def isNumeric(m: Mention): Boolean = { m.isInstanceOf[DateMention] || m.isInstanceOf[DateRangeMention] || diff --git a/main/src/main/scala/org/clulab/numeric/mentions/package.scala b/main/src/main/scala/org/clulab/numeric/mentions/package.scala index 996a834f4..d9ecd003e 100644 --- a/main/src/main/scala/org/clulab/numeric/mentions/package.scala +++ b/main/src/main/scala/org/clulab/numeric/mentions/package.scala @@ -1,9 +1,11 @@ package org.clulab.numeric +import de.jollyday.config.FixedWeekdayInMonth import org.clulab.odin.{EventMention, Mention, RelationMention, TextBoundMention} import org.clulab.struct.Interval import java.util.regex.Pattern +import java.time.{Month, YearMonth} package object mentions { val RANGE_SEP = " -- " @@ -325,10 +327,30 @@ package object mentions { case _ => (None, None) } + val startDate = TempEvalFormatter.mkDate(seasonNorm.get.startDay, seasonNorm.get.startMonth, yearStart) + val endDate = TempEvalFormatter.mkDate(seasonNorm.get.endDay, seasonNorm.get.endMonth, yearEnd) + val dateRangeMention = DateRangeMention(m, startDate, endDate) + + dateRangeMention + + case m => + throw new RuntimeException(s"ERROR: cannot convert mention of type ${m.getClass.toString} to DateRangeMention!") + } + + def toDateRangeMentionWithWeek(weekNormalizer: WeekNormalizer)(mention: Mention): DateRangeMention = mention match { + case m: 
DateRangeMention => m + + case m: RelationMention => + val weekNorm = getWeekRange(weekNormalizer)("week", m) + if (weekNorm.isEmpty) + throw new RuntimeException(s"ERROR: could not find argument week in mention ${m.raw.mkString(" ")}!") + + val month = getArgWords("month", m) + DateRangeMention( m, - TempEvalFormatter.mkDate(seasonNorm.get.startDay, seasonNorm.get.startMonth,yearStart), - TempEvalFormatter.mkDate(seasonNorm.get.endDay, seasonNorm.get.endMonth, yearEnd) + TempEvalFormatter.mkDate(weekNorm.get.startDay, month, None), + TempEvalFormatter.mkDate(weekNorm.get.endDay, month, None) ) case m => @@ -816,7 +838,7 @@ package object mentions { val month = m.group(2) val day = m.group(3) - Tuple3(year, month, day) + (year, month, day) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -829,7 +851,7 @@ package object mentions { val month = m.group(2) val day = m.group(3) - Tuple3(year, month, day) + (year, month, day) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -842,7 +864,7 @@ package object mentions { val month = m.group(2) val year = m.group(3) - Tuple3(year, month, day) + (year, month, day) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -854,7 +876,7 @@ package object mentions { val month = m.group(1) val year = m.group(2) - Tuple2(year, month) + (year, month) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -867,7 +889,7 @@ package object mentions { val year = m.group(1) val month = m.group(2) - Tuple2(year, month) + (year, month) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -879,7 +901,7 @@ package object mentions { val year = m.group(1) val month = m.group(2) - Tuple2(year, month) + (year, month) } else { throw new RuntimeException(s"ERROR: cannot extract year/month/day from date $v!") } @@ -892,6 +914,33 @@ 
package object mentions { else seasonNormalizer.norm(wordsOpt.get) } + private def getWeekRange(weekNormalizer: WeekNormalizer)(argName: String, m:Mention): Option[WeekRange] = { + val wordsOpt = getArgWords(argName, m) + + if (wordsOpt.isEmpty) None + else if (wordsOpt.get.mkString(" ").toLowerCase().equals("last week")) {getLastWeekRange(m)} + else if (wordsOpt.get.mkString(" ").toLowerCase().equals("last two weeks")) {getLastTwoWeeksRange(m)} + else weekNormalizer.norm(wordsOpt.get) + } + + private def getLastWeekRange(m:Mention): Option[WeekRange] = { + val month = getArgWords("month", m) + val modifiedMonth = TempEvalFormatter.convertLiteralMonth(month.get.mkString("")) + val monthObj = Month.of(modifiedMonth) + val lastDay = monthObj.length(false) + + Some(WeekRange(startDay = Some(Seq((lastDay - 6).toString)), endDay = Some(Seq(lastDay.toString)))) + } + + private def getLastTwoWeeksRange(m:Mention): Option[WeekRange] = { + val month = getArgWords("month", m) + val modifiedMonth = TempEvalFormatter.convertLiteralMonth(month.get.mkString("")) + val monthObj = Month.of(modifiedMonth) + val lastDay = monthObj.length(false) + + Some(WeekRange(startDay = Some(Seq((lastDay - 13).toString)), endDay = Some(Seq(lastDay.toString)))) + } + private def getHoliday(holiday: Seq[String], year: Option[Seq[String]]): (Option[Seq[String]], Option[Seq[String]]) = { val dayMonthOpt = HolidayNormalizer.norm(holiday, year) dayMonthOpt match { diff --git a/main/src/main/scala/org/clulab/numeric/package.scala b/main/src/main/scala/org/clulab/numeric/package.scala index 4bcd11ab4..70559d0f9 100644 --- a/main/src/main/scala/org/clulab/numeric/package.scala +++ b/main/src/main/scala/org/clulab/numeric/package.scala @@ -74,19 +74,9 @@ package object numeric { // // initialize entities and norms // - for(s <- doc.sentences) { - if(s.entities.isEmpty) { - s.entities = Some(new Array[String](s.size)) - for(i <- s.entities.get.indices) { - s.entities.get(i) = "O" - } - } - 
if(s.norms.isEmpty) { - s.norms = Some(new Array[String](s.size)) - for(i <- s.norms.get.indices) { - s.norms.get(i) = "" - } - } + for (sentence <- doc.sentences) { + sentence.entities = sentence.entities.orElse(Some(Array.fill(sentence.size)("O"))) + sentence.norms = sentence.norms .orElse(Some(Array.fill(sentence.size)(""))) } // diff --git a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala index cb687b2cd..03f42ac60 100644 --- a/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala +++ b/main/src/main/scala/org/clulab/odin/ExtractorEngine.scala @@ -1,14 +1,15 @@ package org.clulab.odin +import org.clulab.odin +import org.clulab.odin.impl.{ Extractor, RuleReader } +import org.clulab.processors.Document + import java.io._ import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 - import scala.io.{ Codec, Source } import scala.reflect.ClassTag -import org.clulab.processors.Document -import org.clulab.odin -import org.clulab.odin.impl.{ Extractor, RuleReader } +import scala.util.Using class ExtractorEngine(val extractors: Vector[Extractor], val globalAction: Action) { @@ -140,18 +141,17 @@ object ExtractorEngine { private def read(file: File, charset: Charset): String = { implicit val codec: Codec = new Codec(charset) - val source = Source.fromFile(file) - val text = source.mkString - source.close() - text + Using.resource(Source.fromFile(file)) { source => + val text = source.mkString + text + } } private def read(stream: InputStream, charset: Charset): String = { - implicit val codec: Codec = new Codec(charset) - val source = Source.fromInputStream(stream) - val text = source.mkString - source.close() - text + Using.resource (Source.fromInputStream(stream)(new Codec(charset))) { source => + val text = source.mkString + text + } } def fromFile( diff --git a/main/src/main/scala/org/clulab/odin/Mention.scala b/main/src/main/scala/org/clulab/odin/Mention.scala index 
fbf0e5e1b..4cc785efd 100644 --- a/main/src/main/scala/org/clulab/odin/Mention.scala +++ b/main/src/main/scala/org/clulab/odin/Mention.scala @@ -1,12 +1,13 @@ package org.clulab.odin -import scala.util.matching.Regex -import scala.util.hashing.MurmurHash3._ -import org.clulab.struct.Interval +import org.clulab.odin.impl.StringMatcher import org.clulab.processors.Document import org.clulab.scala.WrappedArray._ +import org.clulab.struct.Interval import org.clulab.utils.DependencyUtils -import org.clulab.odin.impl.StringMatcher +import org.clulab.utils.Hash + +import scala.util.matching.Regex trait Mention extends Equals with Ordered[Mention] with Serializable { @@ -126,6 +127,14 @@ trait Mention extends Equals with Ordered[Mention] with Serializable { case None => Nil } + /** returns the minimum distance to a root node for dependencies within the token interval */ + def distToRootOpt: Option[Int] = sentenceObj.dependencies.flatMap { deps => + // Note that + // Double.MaxValue.toInt == Int.MaxValue + // Double.PositiveInfinity.toInt == Int.MaxValue + DependencyUtils.distToRootOpt(tokenInterval, deps).map(_.toInt) + } + /** returns the syntactic head of `mention` */ def synHead: Option[Int] = synHeads.lastOption @@ -187,26 +196,29 @@ trait Mention extends Equals with Ordered[Mention] with Serializable { protected lazy val cachedHashCode = calculateHashCode - protected def calculateHashCode: Int = { - val h0 = stringHash("org.clulab.odin.Mention") - val h1 = mix(h0, labels.hashCode) - val h2 = mix(h1, tokenInterval.hashCode) - val h3 = mix(h2, sentence.hashCode) - val h4 = mix(h3, document.ambivalenceHash) - val h5 = mix(h4, argumentsHashCode) - val h6 = mixLast(h5, unorderedHash(attachments)) - finalizeHash(h6, 6) - } - - private def argumentsHashCode: Int = { - val h0 = stringHash("Mention.arguments") - val hs = arguments map { - case (name, args) => mix(stringHash(name), unorderedHash(args)) + protected def calculateHashCode: Int = Hash.withLast( + 
Hash("org.clulab.odin.Mention"), + labels.hashCode, + tokenInterval.hashCode, + sentence.hashCode, + document.ambivalenceHash, + argsHash, + Hash.unordered(attachments) + ) + + // TODO: Compare this to argsHash in the package. + private def argsHash: Int = { + val argHashes = arguments.map { case (name, mentions) => + val seed = Hash(name) + val data = mentions + + Hash.mix(seed, Hash.unordered(data)) } - val h = mixLast(h0, unorderedHash(hs)) - finalizeHash(h, arguments.size) + Hash.withLast(arguments.size)( + Hash("Mention.arguments"), + Hash.unordered(argHashes) + ) } - } @SerialVersionUID(1L) @@ -318,12 +330,11 @@ class EventMention( } // trigger should be part of the hashCode too - protected override def calculateHashCode: Int = { - val h0 = stringHash("org.clulab.odin.EventMention") - val h1 = mix(h0, super.calculateHashCode) - val h2 = mixLast(h1, trigger.hashCode) - finalizeHash(h2, 2) - } + protected override def calculateHashCode: Int = Hash.withLast( + Hash("org.clulab.odin.EventMention"), + super.calculateHashCode, + trigger.hashCode + ) // Copy constructor for EventMention def copy( diff --git a/main/src/main/scala/org/clulab/odin/impl/CustomRuleReader.scala b/main/src/main/scala/org/clulab/odin/impl/CustomRuleReader.scala new file mode 100644 index 000000000..9bb740e7b --- /dev/null +++ b/main/src/main/scala/org/clulab/odin/impl/CustomRuleReader.scala @@ -0,0 +1,46 @@ +package org.clulab.odin.impl + +import org.clulab.odin.Actions +import org.clulab.odin.impl.RuleReader.Rule +import org.yaml.snakeyaml.Yaml +import org.yaml.snakeyaml.constructor.Constructor + +import java.nio.charset.Charset +import java.util.{Collection, Map => JMap} + +/** This class addresses [[https://github.com/clulab/processors/issues/309]] + * + * Note: nothing is synchronized here, so don't manipulate the configs in a multi- + * threaded environment. 
+ */ +class CustomRuleReader(actions: Actions, charset: Charset) extends RuleReader(actions, charset) { + /** whether the circumstances are right to capture the config in [[readRules]] */ + protected var captureConfig: Boolean = false + /** most-recent config generated in [[rulesFromMasterFile]] and then captured */ + protected var config: OdinConfig = OdinConfig(resources = OdinResourceManager(Map.empty)) + + /** Override that reuses the captured config */ + override protected def rulesFromSimpleFile(input: String): Seq[Rule] = { + val yaml = new Yaml(new Constructor(classOf[Collection[JMap[String, Any]]])) + val jRules = yaml.load(input).asInstanceOf[Collection[JMap[String, Any]]] + + readRules(jRules, this.config) + } + + /** Override that enables the config to be captured */ + override protected def rulesFromMasterFile(input: String): Seq[Rule] = { + // The superclass's version calls readRules and when this happens, we want the config + // to be captured. This saves us from reimplementation of the superclass's method. 
+ captureConfig = true + super.rulesFromMasterFile(input) + } + + /** Override that *captures* the [[OdinConfig]] as a side-effect */ + override protected def readRules(rules: Collection[JMap[String, Any]], config: OdinConfig): Seq[Rule] = { + if (captureConfig) { + this.config = config + captureConfig = false + } + super.readRules(rules, config) + } +} diff --git a/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala b/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala index 814ff3fd0..b07c5219a 100644 --- a/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala +++ b/main/src/main/scala/org/clulab/odin/impl/RuleReader.scala @@ -1,25 +1,22 @@ package org.clulab.odin.impl -import java.io.File -import java.net.URL -import java.util.{Collection, Map => JMap} -import java.nio.charset.Charset -import java.nio.charset.StandardCharsets - -import org.apache.commons.text.StrSubstitutor import org.apache.commons.io.FileUtils.readFileToString - -import scala.jdk.CollectionConverters._ -import scala.io.{Codec, Source} -import org.yaml.snakeyaml.Yaml -import org.yaml.snakeyaml.constructor.{Constructor, ConstructorException} +import org.apache.commons.text.StrSubstitutor import org.clulab.odin._ import org.clulab.odin.impl.MarkdownGeneration._ import org.clulab.scala.WrappedArray._ import org.clulab.utils.FileUtils -import org.clulab.utils.Closer.AutoCloser - +import org.yaml.snakeyaml.Yaml +import org.yaml.snakeyaml.constructor.{Constructor, ConstructorException} +import java.io.File +import java.net.URL +import java.nio.charset.Charset +import java.nio.charset.StandardCharsets +import java.util.{Collection, Map => JMap} +import scala.io.{Codec, Source} +import scala.jdk.CollectionConverters._ +import scala.util.Using class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option[File] = None) { @@ -44,7 +41,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option } } - private def rulesFromSimpleFile(input: 
String): Seq[Rule] = { + protected def rulesFromSimpleFile(input: String): Seq[Rule] = { val yaml = new Yaml(new Constructor(classOf[Collection[JMap[String, Any]]])) val jRules = yaml.load(input).asInstanceOf[Collection[JMap[String, Any]]] // no resources are specified @@ -53,7 +50,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option readRules(jRules, config) } - private def rulesFromMasterFile(input: String): Seq[Rule] = { + protected def rulesFromMasterFile(input: String): Seq[Rule] = { val yaml = new Yaml(new Constructor(classOf[JMap[String, Any]])) val master = yaml.load(input).asInstanceOf[JMap[String, Any]].asScala.toMap val taxonomy = master.get("taxonomy").map(readTaxonomy) @@ -197,7 +194,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option } } - private def readRules( + protected def readRules( rules: Collection[JMap[String, Any]], config: OdinConfig ): Seq[Rule] = { @@ -254,10 +251,10 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option readFileToString(f, StandardCharsets.UTF_8) case None => val url = mkURL(s) - val source = Source.fromURL(url) - val data = source.mkString - source.close() - data + Using.resource(Source.fromURL(url)) { source => + val data = source.mkString + data + } } } @@ -503,7 +500,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option def exportRuleSchemas(input: String, outname: String): Unit = { val markdown = ruleSchemas(input) // export - FileUtils.printWriterFromFile(new File(outname)).autoClose { pw => + Using.resource(FileUtils.printWriterFromFile(new File(outname))) { pw => pw.println(markdown) } } @@ -559,7 +556,7 @@ class RuleReader(val actions: Actions, val charset: Charset, val ruleDir: Option */ def exportExtractionSchemas(input: String, outname: String, minimal: Boolean = false): Unit = { val markdown = extractionSchemas(input, minimal) - FileUtils.printWriterFromFile(new File(outname)).autoClose { pw
=> + Using.resource(FileUtils.printWriterFromFile(new File(outname))) { pw => pw.println(markdown) } } diff --git a/main/src/main/scala/org/clulab/odin/serialization/json/package.scala b/main/src/main/scala/org/clulab/odin/serialization/json/package.scala index 9ffeff3c7..686694b8c 100644 --- a/main/src/main/scala/org/clulab/odin/serialization/json/package.scala +++ b/main/src/main/scala/org/clulab/odin/serialization/json/package.scala @@ -3,9 +3,9 @@ package org.clulab.odin.serialization import org.clulab.odin import org.clulab.odin._ import org.clulab.struct.DirectedGraph +import org.clulab.utils.Hash import org.json4s._ import org.json4s.JsonDSL._ -import scala.util.hashing.MurmurHash3._ package object json { @@ -20,14 +20,18 @@ package object json { } /** Hash representing the [[Mention.arguments]] */ - private def argsHash(args: Map[String, Seq[Mention]]): Int = { - val argHashes = for { - (role, mns) <- args - bh = stringHash(s"role:$role") - hs = mns.map(_.equivalenceHash) - } yield mix(bh, unorderedHash(hs)) - val h0 = stringHash("org.clulab.odin.Mention.arguments") - finalizeHash(h0, unorderedHash(argHashes)) + // TODO: Compare this to Mention.argsHash(). + private def argsHash(arguments: Map[String, Seq[Mention]]): Int = { + val argHashes = arguments.map { case (name, mentions) => + val seed = Hash(s"role:$name") + val data = mentions.map(_.equivalenceHash) + + Hash.mix(seed, Hash.unordered(data)) + } + // TODO: This is not the proper use of the count. 
+ Hash.withLast(Hash.unordered(argHashes))( + Hash("org.clulab.odin.Mention.arguments") + ) } private def pathsAST(paths: Map[String, Map[Mention, odin.SynPath]]): JValue = paths match { @@ -78,21 +82,14 @@ package object json { val stringCode = s"org.clulab.odin.${TextBoundMention.string}" - def equivalenceHash: Int = { - // the seed (not counted in the length of finalizeHash) - val h0 = stringHash(stringCode) - // labels - val h1 = mix(h0, tb.labels.hashCode) - // interval.start - val h2 = mix(h1, tb.tokenInterval.start) - // interval.end - val h3 = mix(h2, tb.tokenInterval.end) - // sentence index - val h4 = mix(h3, tb.sentence) - // document.equivalenceHash - val h5 = mix(h4, tb.document.equivalenceHash) - finalizeHash(h5, 5) - } + def equivalenceHash: Int = Hash( + Hash(stringCode), + tb.labels.hashCode, + tb.tokenInterval.start, + tb.tokenInterval.end, + tb.sentence, + tb.document.equivalenceHash + ) override def id: String = s"${TextBoundMention.shortString}:$equivalenceHash" @@ -116,25 +113,16 @@ package object json { val stringCode = s"org.clulab.odin.${EventMention.string}" - def equivalenceHash: Int = { - // the seed (not counted in the length of finalizeHash) - val h0 = stringHash(stringCode) - // labels - val h1 = mix(h0, em.labels.hashCode) - // interval.start - val h2 = mix(h1, em.tokenInterval.start) - // interval.end - val h3 = mix(h2, em.tokenInterval.end) - // sentence index - val h4 = mix(h3, em.sentence) - // document.equivalenceHash - val h5 = mix(h4, em.document.equivalenceHash) - // args - val h6 = mix(h5, argsHash(em.arguments)) - // trigger - val h7 = mix(h6, TextBoundMentionOps(em.trigger).equivalenceHash) - finalizeHash(h7, 7) - } + def equivalenceHash: Int = Hash( + Hash(stringCode), + em.labels.hashCode, + em.tokenInterval.start, + em.tokenInterval.end, + em.sentence, + em.document.equivalenceHash, + argsHash(em.arguments), + TextBoundMentionOps(em.trigger).equivalenceHash + ) override def id: String = 
s"${EventMention.shortString}:$equivalenceHash" @@ -162,23 +150,15 @@ package object json { val stringCode = s"org.clulab.odin.${RelationMention.string}" - def equivalenceHash: Int = { - // the seed (not counted in the length of finalizeHash) - val h0 = stringHash(stringCode) - // labels - val h1 = mix(h0, rm.labels.hashCode) - // interval.start - val h2 = mix(h1, rm.tokenInterval.start) - // interval.end - val h3 = mix(h2, rm.tokenInterval.end) - // sentence index - val h4 = mix(h3, rm.sentence) - // document.equivalenceHash - val h5 = mix(h4, rm.document.equivalenceHash) - // args - val h6 = mix(h5, argsHash(rm.arguments)) - finalizeHash(h6, 6) - } + def equivalenceHash: Int = Hash( + Hash(stringCode), + rm.labels.hashCode, + rm.tokenInterval.start, + rm.tokenInterval.end, + rm.sentence, + rm.document.equivalenceHash, + argsHash(rm.arguments) + ) override def id: String = s"${RelationMention.shortString}:$equivalenceHash" @@ -205,23 +185,15 @@ package object json { val stringCode = s"org.clulab.odin.${CrossSentenceMention.string}" - def equivalenceHash: Int = { - // the seed (not counted in the length of finalizeHash) - val h0 = stringHash(stringCode) - // labels - val h1 = mix(h0, csm.labels.hashCode) - // interval.start - val h2 = mix(h1, csm.tokenInterval.start) - // interval.end - val h3 = mix(h2, csm.tokenInterval.end) - // sentence index - val h4 = mix(h3, csm.sentence) - // document.equivalenceHash - val h5 = mix(h4, csm.document.equivalenceHash) - // args - val h6 = mix(h5, argsHash(csm.arguments)) - finalizeHash(h6, 6) - } + def equivalenceHash: Int = Hash( + Hash(stringCode), + csm.labels.hashCode, + csm.tokenInterval.start, + csm.tokenInterval.end, + csm.sentence, + csm.document.equivalenceHash, + argsHash(csm.arguments) + ) override def id: String = s"${CrossSentenceMention.shortString}:$equivalenceHash" diff --git a/main/src/main/scala/org/clulab/processors/Document.scala b/main/src/main/scala/org/clulab/processors/Document.scala index 
151e7fa52..6435ab94c 100644 --- a/main/src/main/scala/org/clulab/processors/Document.scala +++ b/main/src/main/scala/org/clulab/processors/Document.scala @@ -3,13 +3,13 @@ package org.clulab.processors import java.io.PrintWriter import org.clulab.struct.{CorefChains, DirectedGraphEdgeIterator} +import org.clulab.utils.Hash import org.clulab.utils.Serializer import org.json4s.JString import org.json4s.JValue import org.json4s.jackson.prettyJson import scala.collection.mutable -import scala.util.hashing.MurmurHash3._ /** * Stores all annotations for one document. @@ -47,25 +47,24 @@ class Document(val sentences: Array[Sentence]) extends Serializable { // Used by equivalenceHash. // return an Int hash based on the Sentence.equivalenceHash of each sentence def sentencesHash: Int = { - val h0 = stringHash(s"$stringCode.sentences") val hs = sentences.map(_.equivalenceHash) - val h = mixLast(h0, unorderedHash(hs)) - finalizeHash(h, sentences.length) + + Hash.withLast(sentences.length)( + Hash(s"$stringCode.sentences"), + Hash.unordered(hs) // TODO: This should be ordered. + ) } - // the seed (not counted in the length of finalizeHash) - // decided to use the class name - val h0 = stringHash(stringCode) - // comprised of the equiv. 
hash of sentences - val h1 = mix(h0, sentencesHash) - finalizeHash(h1, 1) + Hash( + Hash(stringCode), + sentencesHash + ) } - def ambivalenceHash: Int = { - val h0 = stringHash(Document.getClass.getName) - val h1 = mix(h0, orderedHash(sentences.map(_.ambivalenceHash))) - finalizeHash(h1, 1) - } + def ambivalenceHash: Int = Hash( + Hash(Document.getClass.getName), + Hash.ordered(sentences.map(_.ambivalenceHash)) + ) /** Adds an attachment to the document's attachment map */ def addAttachment(name: String, attachment: DocumentAttachment): Unit = { @@ -178,16 +177,21 @@ class Document(val sentences: Array[Sentence]) extends Serializable { }) } - def copy(document: Document): Document = { + def assimilate(document: Document, textOpt: Option[String]): Document = { id = document.id coreferenceChains = document.coreferenceChains - text = document.text + text = textOpt attachments = document.attachments documentCreationTime = document.documentCreationTime this } - def copy(sentences: Array[Sentence] = sentences): Document = new Document(sentences).copy(this) + // sentences are a val, so they must be initialized through the construction of a new Document. + // Thereafter, the remaining values can be assimilated from the old document. The shortcut + // is used so that subclasses don't have to duplicate almost everything in their copy. 
+ def copy(sentences: Array[Sentence] = sentences, textOpt: Option[String] = text): Document = { + new Document(sentences).assimilate(this, textOpt) + } def offset(offset: Int): Document = // If a subclass of Document constructs itself with an attachment or a documentCreationTime that diff --git a/main/src/main/scala/org/clulab/processors/Sentence.scala b/main/src/main/scala/org/clulab/processors/Sentence.scala index 5bdb16fd2..0465226c1 100644 --- a/main/src/main/scala/org/clulab/processors/Sentence.scala +++ b/main/src/main/scala/org/clulab/processors/Sentence.scala @@ -3,10 +3,10 @@ package org.clulab.processors import org.clulab.scala.WrappedArray._ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree} import org.clulab.struct.GraphMap._ +import org.clulab.utils.Hash import org.clulab.utils.SeqUtils import scala.collection.mutable -import scala.util.hashing.MurmurHash3._ /** Stores the annotations for a single sentence */ class Sentence( @@ -52,46 +52,45 @@ class Sentence( protected lazy val cachedAmbivalenceHash = calculateAmbivalenceHash - protected def calculateAmbivalenceHash: Int = { - val h0 = stringHash(Sentence.getClass.getName) - val h1 = mix(h0, orderedHash(raw)) - val h2 = mix(h1, orderedHash(startOffsets)) - val h3 = mix(h2, orderedHash(endOffsets)) - finalizeHash(h3, 3) - } + protected def calculateAmbivalenceHash: Int = Hash( + Hash(Sentence.getClass.getName), + Hash.ordered(raw), + Hash.ordered(startOffsets), + Hash.ordered(endOffsets) + ) /** * Used to compare Sentences. 
* @return a hash (Int) based on the contents of a sentence */ def equivalenceHash: Int = { - val stringCode = "org.clulab.processors.Sentence" - def getAnnotationsHash(labels: Option[Array[_]]): Int = labels match { - case Some(lbls) => - val h0 = stringHash(s"$stringCode.annotations") - val hs = lbls.map(_.hashCode) - val h = mixLast(h0, orderedHash(hs)) - finalizeHash(h, lbls.length) - case None => None.hashCode - } - - // the seed (not counted in the length of finalizeHash) - // decided to use the class name - val h0 = stringHash(stringCode) - // NOTE: words.hashCode will produce inconsistent values - val h1a = mix(h0, getAnnotationsHash(Some(raw))) - val h1b = mix(h1a, getAnnotationsHash(Some(words))) - val h2 = mix(h1b, getAnnotationsHash(Some(startOffsets))) - val h3 = mix(h2, getAnnotationsHash(Some(endOffsets))) - val h4 = mix(h3, getAnnotationsHash(tags)) - val h5 = mix(h4, getAnnotationsHash(lemmas)) - val h6 = mix(h5, getAnnotationsHash(entities)) - val h7 = mix(h6, getAnnotationsHash(norms)) - val h8 = mix(h7, getAnnotationsHash(chunks)) - val h9 = mix(h8, if (dependencies.nonEmpty) dependencies.get.equivalenceHash else None.hashCode) - finalizeHash(h9, 10) + def getAnnotationsHash(labelsOpt: Option[Array[_]]): Int = labelsOpt + .map { labels => + val hs = labels.map(_.hashCode) + val result = Hash.withLast(labels.length)( + Hash(s"$stringCode.annotations"), + Hash.ordered(hs) + ) + + result + } + .getOrElse(None.hashCode) + + Hash( + Hash(stringCode), + getAnnotationsHash(Some(raw)), + getAnnotationsHash(Some(words)), + getAnnotationsHash(Some(startOffsets)), + getAnnotationsHash(Some(endOffsets)), + getAnnotationsHash(tags), + getAnnotationsHash(lemmas), + getAnnotationsHash(entities), + getAnnotationsHash(norms), + getAnnotationsHash(chunks), + if (dependencies.nonEmpty) dependencies.get.equivalenceHash else None.hashCode + ) } /** @@ -173,7 +172,7 @@ class Sentence( reverted } - def copy(sentence: Sentence): Sentence = { + def assimilate(sentence: 
Sentence): Sentence = { tags = sentence.tags lemmas = sentence.lemmas entities = sentence.entities @@ -186,7 +185,7 @@ class Sentence( } def copy(raw: Array[String] = raw, startOffsets: Array[Int] = startOffsets, endOffsets: Array[Int] = endOffsets, words: Array[String] = words): Sentence = - new Sentence(raw, startOffsets, endOffsets, words).copy(this) + new Sentence(raw, startOffsets, endOffsets, words).assimilate(this) def offset(offset: Int): Sentence = { if (offset == 0) this diff --git a/main/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala b/main/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala index e41bdb926..e1b9e5718 100644 --- a/main/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala +++ b/main/src/main/scala/org/clulab/processors/clu/BalaurProcessor.scala @@ -10,7 +10,6 @@ import org.clulab.scala_transformers.encoder.TokenClassifier import org.clulab.scala_transformers.encoder.EncoderMaxTokensRuntimeException import org.clulab.sequences.{LexiconNER, NamedEntity} import org.clulab.struct.DirectedGraph -import org.clulab.struct.Edge import org.clulab.struct.GraphMap import org.clulab.utils.{Configured, MathUtils, ToEnhancedDependencies} import org.slf4j.{Logger, LoggerFactory} @@ -166,7 +165,7 @@ class BalaurProcessor protected ( private def assignPosTags(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { assert(labels.length == sent.words.length) - sent.tags = Some(postprocessPartOfSpeechTags(sent.words, labels.map(_.head._1))) + sent.tags = Some(postprocessPartOfSpeechTags(sent.words, labels.map(_.head._1).toArray)) } /** Must be called after assignPosTags and lemmatize because it requires Sentence.tags and Sentence.lemmas */ @@ -193,7 +192,7 @@ class BalaurProcessor protected ( ner.find(sentence) } - val genericLabels = NamedEntity.patch(labels.map(_.head._1)) + val genericLabels = NamedEntity.patch(labels.map(_.head._1).toArray) if(optionalNERLabels.isEmpty) { sent.entities = Some(genericLabels) 
@@ -228,7 +227,7 @@ class BalaurProcessor protected ( private def assignChunkLabels(labels: Array[Array[(String, Float)]], sent: Sentence): Unit = { assert(labels.length == sent.words.length) - sent.chunks = Some(labels.map(_.head._1)) + sent.chunks = Some(labels.map(_.head._1).toArray) } // The head has one score, the label has another. Here the two scores are interpolated @@ -260,10 +259,10 @@ class BalaurProcessor protected ( // valid Dependencies remain. val sortedWordDependencies = wordDependencies.sortBy(-_.score) - sortedWordDependencies + sortedWordDependencies.toArray } - sentDependencies + sentDependencies.toArray } // sent = sentence, word = word @@ -294,11 +293,15 @@ class BalaurProcessor protected ( val enhancedDepGraph = ToEnhancedDependencies.generateUniversalEnhancedDependencies(sent, depGraph) sent.graphs += GraphMap.UNIVERSAL_ENHANCED -> enhancedDepGraph + + // ideally, hybrid dependencies should contain both syntactic dependencies and semantic roles + // however, this processor produces only syntactic dependencies + sent.graphs += GraphMap.HYBRID_DEPENDENCIES -> enhancedDepGraph } def greedilyGenerateOutput(sentDependencies: Array[Array[Dependency]]): Array[Dependency] = { // These are already sorted by score, so head will extract the best one. 
- sentDependencies.map(_.head) + sentDependencies.map(_.head).toArray } } diff --git a/main/src/main/scala/org/clulab/processors/clu/Veil.scala b/main/src/main/scala/org/clulab/processors/clu/Veil.scala new file mode 100644 index 000000000..9e156bf5c --- /dev/null +++ b/main/src/main/scala/org/clulab/processors/clu/Veil.scala @@ -0,0 +1,244 @@ +package org.clulab.processors.clu + +import org.clulab.processors.{Document, Processor, Sentence} +import org.clulab.serialization.DocumentSerializer +import org.clulab.struct.{DirectedGraph, Edge, GraphMap, RelationTriple, Tree} +import org.clulab.struct.GraphMap._ + +import java.io.PrintWriter +import scala.collection.mutable.{Set => MutableSet} +import scala.util.Using + +trait Veil + +object Veil { + val veiledTag = "" + val veiledLemma = "" + val veiledEntity = "" + val veiledNorm = "" + val veiledChunk = "" +} + +/** Manipulate a document with veiled text + * + * @param originalText text that has not yet been veiled + * @param veiledLetters a sequence of ranges which specify by index which letters in the original text to veil + * when a document is created with mkDocument(processor) + * + * See [[VeilApp.veilText]] for an example. + */ +class VeiledText(originalText: String, veiledLetters: Seq[Range]) extends Veil { + /** This is a set containing veiled letter indices. + * They have been vetted and deduplicated. 
+ */ + protected lazy val veilSet: MutableSet[Int] = { + val set = MutableSet.empty[Int] + val letterIndexes = originalText.indices + + veiledLetters.foreach { letterRange => + letterRange.foreach { letterIndex => + letterIndexes.lift(letterIndex).foreach(set += _) + } + } + set + } + protected lazy val veiledText: String = { + val letters = new StringBuffer(originalText) + + veilSet.foreach(letters.setCharAt(_, ' ')) + letters.toString + } + + protected def unveilDocument(veiledDocument: Document): Document = { + val unveiledDocument = veiledDocument.copy(textOpt = Some(originalText)) + + unveiledDocument + } + + def mkDocument(processor: Processor): Document = { + val veiledDocument = processor.mkDocument(veiledText) + val unveiledDocument = unveilDocument(veiledDocument) + + unveiledDocument + } +} + +/** Manipulate a document with text veiled by word + * + * @param originalDocument a document that has not yet been veiled + * @param veiledWords a sequence of (integer, range) pairs which specify by sentence index and then word index range + * which words of a document to veil during annotation with annotate(processor) + * + * See [[VeilApp.veilDocument]] for an example. + */ +class VeiledDocument(originalDocument: Document, veiledWords: Seq[(Int, Range)]) extends Veil { + /** This is an array of sets, each containing veiled word indices for each sentence. + * They have been vetted and deduplicated. 
+ */ + protected lazy val veilSets: Array[MutableSet[Int]] = { + val sets = Array.fill(originalDocument.sentences.length)(MutableSet.empty[Int]) + + veiledWords.foreach { case (sentenceIndex, wordRange) => + sets.lift(sentenceIndex).foreach { set => + val wordIndexes = originalDocument.sentences(sentenceIndex).words.indices + + wordRange.foreach { wordIndex => + wordIndexes.lift(wordIndex).foreach(set += _) + } + } + } + sets + } + /** There is one array per sentence and it contains at each index the index where a value (e.g., word in + * an array of words) should be transferred as it is unveiled. Code using the unveilArrays might look like + * unveiledValues(unveilArrays(sentenceIndex)(veiledIndex)) = veiledValues(veiledIndex) + */ + protected lazy val unveilArrays = { + val arrays = originalDocument.sentences.zip(veilSets).map { case (originalSentence, set) => + val array = new Array[Int](originalSentence.words.length - set.size) + var unveiledIndex = 0 + + array.indices.foreach { veiledIndex => + while (set(unveiledIndex)) + unveiledIndex += 1 + array(veiledIndex) = unveiledIndex + unveiledIndex += 1 + } + array + } + + arrays + } + protected lazy val veiledDocument = { + val veiledSentences = originalDocument.sentences.zipWithIndex.map { case (originalSentence, sentenceIndex) => + val wordIndexes = originalSentence.words.indices.filterNot(veilSets(sentenceIndex)).toArray + val veiledRaw = wordIndexes.map(originalSentence.raw) + val veiledStartOffsets = wordIndexes.map(originalSentence.startOffsets) + val veiledEndOffsets = wordIndexes.map(originalSentence.endOffsets) + val veiledWords = wordIndexes.map(originalSentence.words) + val veiledSentence = originalSentence.copy(veiledRaw, veiledStartOffsets, veiledEndOffsets, veiledWords) + + veiledSentence + } + + originalDocument.copy(veiledSentences) + } + + def unveilStringArray(veiledArrayOpt: Option[Array[String]], sentenceIndex: Int, veil: String): Option[Array[String]] = { + val unveilArray = 
unveilArrays(sentenceIndex) + val originalLength = originalDocument.sentences(sentenceIndex).words.length + + veiledArrayOpt.map { veiledArray => + val unveiledArray = Array.fill(originalLength)(veil) + + veiledArray.zipWithIndex.foreach { case (veiledString, veiledIndex) => + unveiledArray(unveilArray(veiledIndex)) = veiledString + } + unveiledArray + } + } + + def unveilGraphs(veiledGraphs: GraphMap, sentenceIndex: Int): GraphMap = { + val unveilArray = unveilArrays(sentenceIndex) + val unveiledGraphs = GraphMap() + val originalLength = originalDocument.sentences(sentenceIndex).words.length + + veiledGraphs.foreach { case (name, veiledDirectedGraph) => + val unveiledEdges = veiledDirectedGraph.allEdges.map { case (veiledSource, veiledDestination, relation) => + Edge(unveilArray(veiledSource), unveilArray(veiledDestination), relation) + } + val unveiledRoots = veiledDirectedGraph.roots.map(unveilArray) + + unveiledGraphs(name) = new DirectedGraph(unveiledEdges, Some(originalLength), Some(unveiledRoots)) + } + unveiledGraphs + } + + // TODO + def unveilSyntacticTree(syntacticTreeOpt: Option[Tree]): Option[Tree] = syntacticTreeOpt + + // TODO + def unveilRelations(relations: Option[Array[RelationTriple]]): Option[Array[RelationTriple]] = relations + + protected def unveilSentence(veiledSentence: Sentence, sentenceIndex: Int): Sentence = { + val originalSentence = originalDocument.sentences(sentenceIndex) + val unveiledRaw = originalSentence.raw + val unveiledStartOffsets = originalSentence.startOffsets + val unveiledEndOffsets = originalSentence.endOffsets + val unveiledWords = originalSentence.words + val unveiledSentence = veiledSentence.copy(unveiledRaw, unveiledStartOffsets, unveiledEndOffsets, unveiledWords) + + def unveilStringArray(veiledArrayOpt: Option[Array[String]], veil: String): Option[Array[String]] = + this.unveilStringArray(veiledArrayOpt, sentenceIndex, veil) + + unveiledSentence.tags = unveilStringArray(unveiledSentence.tags, Veil.veiledTag) + 
unveiledSentence.lemmas = unveilStringArray(unveiledSentence.lemmas, Veil.veiledLemma) + unveiledSentence.entities = unveilStringArray(unveiledSentence.entities, Veil.veiledEntity) + unveiledSentence.norms = unveilStringArray(unveiledSentence.norms, Veil.veiledNorm) + unveiledSentence.chunks = unveilStringArray(unveiledSentence.chunks, Veil.veiledChunk) + + unveiledSentence.syntacticTree = unveilSyntacticTree(unveiledSentence.syntacticTree) + unveiledSentence.graphs = unveilGraphs(unveiledSentence.graphs, sentenceIndex) + unveiledSentence.relations = unveilRelations(unveiledSentence.relations) + unveiledSentence + } + + protected def unveilDocument(veiledDocument: Document): Document = { + val unveiledSentences = veiledDocument.sentences.zipWithIndex.map { case (veiledSentence, sentenceIndex) => + unveilSentence(veiledSentence, sentenceIndex) + } + val unveiledAnnotatedDocument = veiledDocument.copy(unveiledSentences) + + unveiledAnnotatedDocument + } + + def annotate(processor: Processor): Document = { + val veiledAnnotatedDocument = processor.annotate(veiledDocument) + val unveiledAnnotatedDocument = unveilDocument(veiledAnnotatedDocument) + + unveiledAnnotatedDocument + } +} + +/** Demonstrate how either parts of the text or Document can be veiled. + */ +object VeilApp extends App { + + /** Treat this text as if the letters "(Hahn-Powell, 2012)" did not exist + * for the purpose of mkDocument, but do include them in the text. + */ + def veilText(processor: Processor): Unit = { + val text = "To be loved by unicorns is the greatest gift of all (Hahn-Powell, 2012)."
+ val veiledLetters = Seq(Range.inclusive(text.indexOf('('), text.indexOf(')'))) + val veiledText = new VeiledText(text, veiledLetters) + val document: Document = veiledText.mkDocument(processor) + + Using.resource(new PrintWriter("veiledLetters.out")) { printWriter => + val documentSerializer = new DocumentSerializer() + + documentSerializer.save(document, printWriter) + } + } + + /** Treat this text as if the words "( Hahn-Powell , 2012 )" did not exist + * for the purpose of annotate, but do include them in the document. + */ + def veilDocument(processor: Processor): Unit = { + val text = "To be loved by unicorns is the greatest gift of all (Hahn-Powell, 2012)." + val document: Document = processor.mkDocument(text) + val veiledWords = Seq((0, Range.inclusive(document.sentences(0).raw.indexOf("("), document.sentences(0).raw.indexOf(")")))) + val veiledDocument = new VeiledDocument(document, veiledWords) + val annotatedDocument: Document = veiledDocument.annotate(processor) + + Using.resource(new PrintWriter("veiledWords.out")) { printWriter => + val documentSerializer = new DocumentSerializer() + + documentSerializer.save(annotatedDocument, printWriter) + } + } + + val processor = new BalaurProcessor() + + veilText(processor) + veilDocument(processor) +} diff --git a/main/src/main/scala/org/clulab/processors/clu/package.scala b/main/src/main/scala/org/clulab/processors/clu/package.scala new file mode 100644 index 000000000..d14269f9b --- /dev/null +++ b/main/src/main/scala/org/clulab/processors/clu/package.scala @@ -0,0 +1,6 @@ +package org.clulab.processors + +package object clu { + type CluProcessor = BalaurProcessor // This takes care of the class. + val CluProcessor = BalaurProcessor // This takes care of the companion object. 
+} diff --git a/main/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala b/main/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala index f38c184c3..f644da4f0 100644 --- a/main/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala +++ b/main/src/main/scala/org/clulab/processors/clu/tokenizer/SentenceSplitter.scala @@ -1,11 +1,11 @@ package org.clulab.processors.clu.tokenizer -import java.io.{BufferedReader, InputStreamReader} - import org.clulab.processors.Sentence +import java.io.{BufferedReader, InputStreamReader} import scala.collection.mutable.ArrayBuffer import scala.util.matching.Regex +import scala.util.Using import SentenceSplitter._ @@ -199,25 +199,25 @@ object SentenceSplitter { private def loadDictionary(rn:String): Regex = { val is = SentenceSplitter.getClass.getClassLoader.getResourceAsStream(rn) assert(is != null, s"Failed to find resource $rn in the classpath!") - val reader = new BufferedReader(new InputStreamReader(is)) val regex = new StringBuilder regex.append("^(") - var done = false - var first = true - while(! done) { - val line = reader.readLine() - if(line == null) { - done = true - } else if(! line.startsWith("#")) { // skip comments - if(! 
first) regex.append("|") - regex.append(normalizeSpecialChars(line.trim)) - first = false + Using.resource(new BufferedReader(new InputStreamReader(is))) { reader => + var done = false + var first = true + while (!done) { + val line = reader.readLine() + if (line == null) { + done = true + } else if (!line.startsWith("#")) { // skip comments + if (!first) regex.append("|") + regex.append(normalizeSpecialChars(line.trim)) + first = false + } } } regex.append(")$") - reader.close() regex.toString.r } diff --git a/main/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala b/main/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala index 2d38ddd5e..3278df5f2 100644 --- a/main/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala +++ b/main/src/main/scala/org/clulab/sequences/BiMEMMSequenceTagger.scala @@ -1,7 +1,5 @@ package org.clulab.sequences -import java.io._ - import org.clulab.learning._ import org.clulab.processors.{Document, Sentence} import org.clulab.scala.WrappedArray._ @@ -10,8 +8,10 @@ import org.clulab.sequences.SequenceTaggerLogger._ import org.clulab.struct.Counter import org.clulab.utils.SeqUtils +import java.io._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag +import scala.util.Using /** * Bidirectional MEMM sequence tagger @@ -49,18 +49,18 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( val firstPassFile = new File(FIRST_PASS_FILE) firstPassLabels = if(firstPassFile.exists()) { logger.debug(s"Found cached file with first-pass labels: $FIRST_PASS_FILE") - val source = scala.io.Source.fromFile(firstPassFile) - val labels = readFirstPassLabels(source) - source.close() + val labels = Using.resource(scala.io.Source.fromFile(firstPassFile)) { source => + readFirstPassLabels(source) + } Some(labels) } else { logger.debug("Generating first-pass labels from scratch...") val labels = mkFirstPassLabels(sentences) - val pw = new PrintWriter(new FileWriter(FIRST_PASS_FILE)) - for(s <- 
labels) { - pw.println(s.mkString("\t")) + Using.resource(new PrintWriter(FIRST_PASS_FILE)) { pw => + for (s <- labels) { + pw.println(s.mkString("\t")) + } } - pw.close() Some(labels) } assert(firstPassLabels.get.length >= sentences.size) @@ -245,27 +245,27 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( private def mkDatum(label:L, features:Counter[F]): Datum[L, F] = new RVFDatum[L, F](label, features) private def mkClassifier: Classifier[L, F] = new L1LogisticRegressionClassifier[L, F]() // TODO: add all classifiers private def mkFullFold(size:Int): DatasetFold = - new DatasetFold(testFold = Tuple2(-1, -1), trainFolds = List(Tuple2(0, size))) + new DatasetFold(testFold = (-1, -1), trainFolds = List((0, size))) override def save(fn:File): Unit = { // save meta data - var w = new PrintWriter(new FileWriter(fn)) - w.println(order) - w.println(leftToRight) + Using.resource(new PrintWriter(fn)) { w => + w.println(order) + w.println(leftToRight) - // save second pass model - secondPassModel.get.saveTo(w) - w.close() + // save second pass model + secondPassModel.get.saveTo(w) + } // save first pass model (if any) - w = new PrintWriter(new FileWriter(fn, true)) - if(firstPassModel.nonEmpty) { - w.println(1) - firstPassModel.get.saveTo(w) - } else { - w.println(0) + Using.resource(new PrintWriter(new FileWriter(fn, true))) { w => + if (firstPassModel.nonEmpty) { + w.println(1) + firstPassModel.get.saveTo(w) + } else { + w.println(0) + } } - w.close() } override def load(reader:BufferedReader): Unit = { @@ -284,6 +284,5 @@ abstract class BiMEMMSequenceTagger[L: ClassTag, F: ClassTag]( } else { firstPassModel = None } - reader.close() } } diff --git a/main/src/main/scala/org/clulab/sequences/ColumnReader.scala b/main/src/main/scala/org/clulab/sequences/ColumnReader.scala index 8e8a89381..dad751c0f 100644 --- a/main/src/main/scala/org/clulab/sequences/ColumnReader.scala +++ b/main/src/main/scala/org/clulab/sequences/ColumnReader.scala @@ -1,10 +1,10 @@ 
package org.clulab.sequences -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Sourcer import scala.collection.mutable.ArrayBuffer import scala.io.Source +import scala.util.Using /** * Reads the CoNLL-like column format @@ -12,7 +12,7 @@ import scala.io.Source object ColumnReader { def readColumns(fn: String): Array[Array[Row]] = { // That which opens the file should also close it, none other. - Sourcer.sourceFromFilename(fn).autoClose { source => + Using.resource(Sourcer.sourceFromFilename(fn)) { source => readColumns(source: Source) } } diff --git a/main/src/main/scala/org/clulab/sequences/ColumnsToDocument.scala b/main/src/main/scala/org/clulab/sequences/ColumnsToDocument.scala index 7f9738b70..37cc7dbe1 100644 --- a/main/src/main/scala/org/clulab/sequences/ColumnsToDocument.scala +++ b/main/src/main/scala/org/clulab/sequences/ColumnsToDocument.scala @@ -1,12 +1,13 @@ package org.clulab.sequences -import java.io.InputStream +import org.clulab.processors.{Document, Processor, Sentence} +import org.clulab.processors.clu.BalaurProcessor +import org.slf4j.{Logger, LoggerFactory} +import java.io.InputStream import scala.collection.mutable.ArrayBuffer import scala.io.Source -import org.clulab.processors.clu.BalaurProcessor -import org.clulab.processors.{Document, Processor, Sentence} -import org.slf4j.{Logger, LoggerFactory} +import scala.util.Using class ColumnsToDocument @@ -52,9 +53,9 @@ object ColumnsToDocument { this.prevLang = lang } - val source = Source.fromFile(fn) - - readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) + Using.resource(Source.fromFile(fn)) { source => + readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) + } } def readFromStream(stream:InputStream, @@ -79,8 +80,9 @@ object ColumnsToDocument { this.proc = new BalaurProcessor() } - val source = Source.fromInputStream(stream) - readFromSource(source, wordPos, labelPos, setLabels, annotate, 
filterOutContractions) + Using.resource(Source.fromInputStream(stream)) { source => + readFromSource(source, wordPos, labelPos, setLabels, annotate, filterOutContractions) + } } def readFromSource(source:Source, @@ -140,7 +142,6 @@ object ColumnsToDocument { s.tags = Some(labels.toArray) sentences += s } - source.close() logger.debug(s"Loaded ${sentences.size} sentences.") val d = new Document(sentences.toArray) diff --git a/main/src/main/scala/org/clulab/sequences/LexiconNERBuilder.scala b/main/src/main/scala/org/clulab/sequences/LexiconNERBuilder.scala index 2d473b8cc..4e0a31cf2 100644 --- a/main/src/main/scala/org/clulab/sequences/LexiconNERBuilder.scala +++ b/main/src/main/scala/org/clulab/sequences/LexiconNERBuilder.scala @@ -5,7 +5,6 @@ package org.clulab.sequences -import java.util.function.Consumer import org.clulab.scala.WrappedArray._ import org.clulab.scala.WrappedArrayBuffer._ import org.clulab.struct.BooleanHashTrie @@ -14,14 +13,15 @@ import org.clulab.struct.EntityValidator import org.clulab.struct.IntHashTrie import org.clulab.utils.FileUtils import org.clulab.utils.Files -import org.clulab.utils.Serializer import org.slf4j.Logger import org.slf4j.LoggerFactory import java.io.File -import scala.collection.mutable.{HashMap => MutableHashMap, HashSet => MutableHashSet, Map => MutableMap, Set => MutableSet} +import java.util.function.Consumer import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{HashMap => MutableHashMap, HashSet => MutableHashSet, Map => MutableMap, Set => MutableSet} import scala.io.Source +import scala.util.Using /** * Concrete subclasses are responsible for building various NERs. 
The mapping is as follows: @@ -107,7 +107,7 @@ trait ResourceKbSource { } def consume(resourceName: String, consumer: Consumer[String]): Unit = { - Serializer.using(Files.loadStreamFromClasspath(resourceName)) { bufferedReader => + Using.resource(Files.loadStreamFromClasspath(resourceName)) { bufferedReader => bufferedReader.lines.forEach(consumer) } } @@ -146,7 +146,7 @@ trait FileKbSource { def consume(resourceName: String, baseDir: File, consumer: Consumer[String]): Unit = { val file = new File(baseDir, if (resourceName.startsWith("/")) resourceName.drop(1) else resourceName) - Serializer.using(Files.loadFile(file)) { bufferedReader => + Using.resource(Files.loadFile(file)) { bufferedReader => bufferedReader.lines.forEach(consumer) } } diff --git a/main/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala b/main/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala index dab2e0cf7..aa6ac8b47 100644 --- a/main/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala +++ b/main/src/main/scala/org/clulab/sequences/MEMMSequenceTagger.scala @@ -1,7 +1,5 @@ package org.clulab.sequences -import java.io._ - import org.clulab.learning._ import org.clulab.processors.{Document, Sentence} import org.clulab.scala.WrappedArray._ @@ -10,8 +8,10 @@ import org.clulab.sequences.SequenceTaggerLogger._ import org.clulab.struct.Counter import org.clulab.utils.SeqUtils +import java.io._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag +import scala.util.Using /** * Sequence tagger using a maximum entrop Markov model (MEMM) @@ -83,18 +83,16 @@ abstract class MEMMSequenceTagger[L: ClassTag, F: ClassTag](var order:Int = 1, v if(leftToRight) history.toArray else SeqUtils.revert(history).toArray } - override def save(fn:File): Unit = { - val w = new PrintWriter(new FileWriter(fn)) - w.println(order) - model.get.saveTo(w) - w.close() + override def save(file: File): Unit = { + Using.resource(new PrintWriter(file)) { w => + w.println(order) + 
model.get.saveTo(w) + } } - override def load(reader:BufferedReader): Unit = { + override def load(reader: BufferedReader): Unit = { order = reader.readLine().toInt val c = LiblinearClassifier.loadFrom[L, F] (reader) model = Some(c) } - } - diff --git a/main/src/main/scala/org/clulab/sequences/NormalizeParens.scala b/main/src/main/scala/org/clulab/sequences/NormalizeParens.scala index 5d0bdb764..fea08c464 100644 --- a/main/src/main/scala/org/clulab/sequences/NormalizeParens.scala +++ b/main/src/main/scala/org/clulab/sequences/NormalizeParens.scala @@ -1,8 +1,8 @@ package org.clulab.sequences -import java.io.{FileWriter, PrintWriter} - +import java.io.PrintWriter import scala.io.Source +import scala.util.Using /** * Transforms -LRB-, -LCB-, etc. tokens back into "(", "{", etc. @@ -14,24 +14,24 @@ import scala.io.Source object NormalizeParens { def main(args: Array[String]): Unit = { val isConll = args(1) == "conll" - val pw = new PrintWriter(new FileWriter(args(0) + ".parens")) - for(line <- Source.fromFile(args(0)).getLines()){ - if(line.trim.isEmpty) { - pw.println(line) - } else { - val tokens = line.split("\\s+") - if(isConll) { - assert(tokens.length > 3) - tokens(1) = norm(tokens(1)) - tokens(2) = norm(tokens(2)) - pw.println(tokens.mkString("\t")) + Using.resource(new PrintWriter(args(0) + ".parens")) { pw => + for (line <- Source.fromFile(args(0)).getLines()) { + if (line.trim.isEmpty) { + pw.println(line) } else { - assert(tokens.length == 2) - pw.println(norm(tokens(0)) + "\t" + tokens(1)) + val tokens = line.split("\\s+") + if (isConll) { + assert(tokens.length > 3) + tokens(1) = norm(tokens(1)) + tokens(2) = norm(tokens(2)) + pw.println(tokens.mkString("\t")) + } else { + assert(tokens.length == 2) + pw.println(norm(tokens(0)) + "\t" + tokens(1)) + } } } } - pw.close() } def norm(s:String): String = { diff --git a/main/src/main/scala/org/clulab/sequences/SequenceTagger.scala b/main/src/main/scala/org/clulab/sequences/SequenceTagger.scala index 
a13ec3f62..6c902e89f 100644 --- a/main/src/main/scala/org/clulab/sequences/SequenceTagger.scala +++ b/main/src/main/scala/org/clulab/sequences/SequenceTagger.scala @@ -1,11 +1,12 @@ package org.clulab.sequences -import java.io.{BufferedReader, File, FileInputStream, InputStream} - import org.clulab.processors.{Document, Sentence} import org.clulab.struct.Counter import org.clulab.utils.Files +import java.io.{BufferedReader, File} +import scala.util.Using + /** * Trait for all sequence taggers * User: mihais @@ -27,13 +28,15 @@ trait SequenceTagger[L, F] extends Tagger[L] { def save(fn:File): Unit def loadFromFile(fn:File): Unit = { - val is = Files.loadFile(fn) - load(is) + Using.resource(Files.loadFile(fn)) { is => + load(is) + } } def loadFromResource(rn:String): Unit = { - val is = Files.loadStreamFromClasspath(rn) - load(is) + Using.resource(Files.loadStreamFromClasspath(rn)) { is => + load(is) + } } def load(is:BufferedReader): Unit diff --git a/main/src/main/scala/org/clulab/sequences/SequenceTaggerEvaluator.scala b/main/src/main/scala/org/clulab/sequences/SequenceTaggerEvaluator.scala index 641687d6e..6f1337d52 100644 --- a/main/src/main/scala/org/clulab/sequences/SequenceTaggerEvaluator.scala +++ b/main/src/main/scala/org/clulab/sequences/SequenceTaggerEvaluator.scala @@ -1,49 +1,52 @@ package org.clulab.sequences -import java.io.PrintWriter - import org.clulab.processors.Document import org.clulab.sequences.SequenceTaggerEvaluator._ +import org.clulab.utils.NullWriter import org.slf4j.{Logger, LoggerFactory} +import java.io.PrintWriter +import scala.util.Using + /** * Implements evaluation of a sequence tagger * Created by mihais on 6/8/17. 
*/ class SequenceTaggerEvaluator[L, F] { def accuracy(tagger:SequenceTagger[L, F], docs:Iterator[Document], saveOutput:Boolean = true): Double = { - val pw:Option[PrintWriter] = - if(saveOutput) Some(new PrintWriter("output_for_conlleval.txt")) - else None - var correct = 0 - var total = 0 - for(doc <- docs; sentence <- doc.sentences) { - val goldLabels = tagger.labelExtractor(sentence) - val predLabels = tagger.classesOf(sentence) - assert(goldLabels.size == predLabels.size) - - for(i <- 0 until sentence.size) { - val tag = - if(sentence.tags.isDefined) sentence.tags.get(i) - else "X" - - if(pw.isDefined) pw.get.println(s"${sentence.words(i)} $tag ${goldLabels(i)} ${predLabels(i)}") + Using.resource( + if (saveOutput) new PrintWriter("output_for_conlleval.txt") + else new PrintWriter(new NullWriter()) + ) { pw => + var correct = 0 + var total = 0 + for (doc <- docs; sentence <- doc.sentences) { + val goldLabels = tagger.labelExtractor(sentence) + val predLabels = tagger.classesOf(sentence) + assert(goldLabels.size == predLabels.size) + + for (i <- 0 until sentence.size) { + val tag = + if (sentence.tags.isDefined) sentence.tags.get(i) + else "X" + + pw.println(s"${sentence.words(i)} $tag ${goldLabels(i)} ${predLabels(i)}") + } + pw.println() + + total += goldLabels.size + for (i <- goldLabels.indices) + if (goldLabels(i) == predLabels(i)) + correct += 1 } - if(pw.isDefined) pw.get.println() - total += goldLabels.size - for(i <- goldLabels.indices) - if(goldLabels(i) == predLabels(i)) - correct += 1 - } + if (saveOutput) + logger.info("Scorable file in the CoNLL format saved to file: output_for_conlleval.txt") - if(pw.isDefined) { - logger.info("Scorable file in the CoNLL format saved to file: output_for_conlleval.txt") - pw.get.close() + val acc = 100.0 * correct.toDouble / total + logger.info(s"Accuracy = $acc ($correct/$total)") + acc } - val acc = 100.0 * correct.toDouble / total - logger.info(s"Accuracy = $acc ($correct/$total)") - acc } } diff --git 
a/main/src/main/scala/org/clulab/serialization/DocumentSerializer.scala b/main/src/main/scala/org/clulab/serialization/DocumentSerializer.scala index 8047bc5f8..8016375ee 100644 --- a/main/src/main/scala/org/clulab/serialization/DocumentSerializer.scala +++ b/main/src/main/scala/org/clulab/serialization/DocumentSerializer.scala @@ -1,9 +1,5 @@ package org.clulab.serialization -import java.io._ -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.reflect.ClassTag import org.clulab.processors.DocumentAttachment import org.clulab.processors.DocumentAttachmentBuilderFromText import org.clulab.processors.{Document, Sentence} @@ -11,6 +7,12 @@ import org.clulab.struct._ import org.clulab.utils.Logging import org.json4s.DefaultFormats +import java.io._ +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.reflect.ClassTag +import scala.util.Using + /** * Saves/loads a Document to/from a stream * An important focus here is to minimize the size of the serialized Document. 
@@ -142,10 +144,10 @@ class DocumentSerializer extends Logging { def load(s:String, encoding:String = "UTF-8"): Document = { val is = new ByteArrayInputStream(s.getBytes(encoding)) - val r = new BufferedReader(new InputStreamReader(is)) - val doc = load(r) - r.close() - doc + Using.resource(new BufferedReader(new InputStreamReader(is))) { r => + val doc = load(r) + doc + } } private def loadText (r:BufferedReader, charCount:Int): String = { @@ -346,11 +348,9 @@ class DocumentSerializer extends Logging { def save(doc:Document, encoding:String = "UTF-8", keepText:Boolean = false): String = { val byteOutput = new ByteArrayOutputStream - val os = new PrintWriter(byteOutput) - save(doc, os, keepText) - os.flush() - os.close() - byteOutput.close() + Using.resource(new PrintWriter(byteOutput)) { os => + save(doc, os, keepText) + } byteOutput.toString(encoding) } diff --git a/main/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala b/main/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala index 3bf57a935..4178aa3ab 100644 --- a/main/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala +++ b/main/src/main/scala/org/clulab/serialization/json/JSONSerializer.scala @@ -74,9 +74,10 @@ object JSONSerializer { } val s = json.extract[Sentence] + val preferredSize = s.words.length // build dependencies val graphs = (json \ "graphs").extract[JObject].obj.map { case (key, json) => - key -> toDirectedGraph(json) + key -> toDirectedGraph(json, Some(preferredSize)) }.toMap s.graphs = GraphMap(graphs) // build labels @@ -88,12 +89,12 @@ object JSONSerializer { s } - def toDirectedGraph(json: JValue): DirectedGraph[String] = { + def toDirectedGraph(json: JValue, preferredSizeOpt: Option[Int] = None): DirectedGraph[String] = { val edges = (json \ "edges").extract[List[Edge[String]]] // The roots remain for backward compatibility, but they are ignored. 
val roots = (json \ "roots").extract[Set[Int]] - new DirectedGraph(edges) + new DirectedGraph(edges, preferredSizeOpt) } private def getStringOption(json: JValue, key: String): Option[String] = json \ key match { diff --git a/main/src/main/scala/org/clulab/serialization/json/package.scala b/main/src/main/scala/org/clulab/serialization/json/package.scala index 71412883a..27adb3fd9 100644 --- a/main/src/main/scala/org/clulab/serialization/json/package.scala +++ b/main/src/main/scala/org/clulab/serialization/json/package.scala @@ -40,7 +40,7 @@ package object json { def jsonAST: JValue = { ("edges" -> dg.edges.map(_.jsonAST)) ~ // The roots are being saved for backward compatibility and human consumption. - ("roots" -> dg.roots) + ("roots" -> dg.roots.toSeq.sorted) // If this remains a set, output order may change. } } diff --git a/main/src/main/scala/org/clulab/struct/DirectedGraph.scala b/main/src/main/scala/org/clulab/struct/DirectedGraph.scala index 0eda772e3..caf631e9d 100644 --- a/main/src/main/scala/org/clulab/struct/DirectedGraph.scala +++ b/main/src/main/scala/org/clulab/struct/DirectedGraph.scala @@ -1,10 +1,10 @@ package org.clulab.struct import org.clulab.scala.WrappedArray._ +import org.clulab.utils.Hash import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.util.hashing.MurmurHash3._ /** @@ -41,14 +41,10 @@ case class DirectedGraph[E]( * * @return a hash (Int) based on the [[edges]] */ - def equivalenceHash: Int = { - val stringCode = "org.clulab.struct.DirectedGraph" - // the seed (not counted in the length of finalizeHash) - // decided to use the class name - val h0 = stringHash(stringCode) - val h1 = mix(h0, edges.hashCode) - finalizeHash(h1, 1) - } + def equivalenceHash: Int = Hash( + Hash("org.clulab.struct.DirectedGraph"), + edges.hashCode + ) protected def computeSize(edges: List[Edge[_]]):Int = { val maxVertex = edges.foldLeft(0) { (max, edge) => math.max(max, math.max(edge.source, 
edge.destination)) } diff --git a/main/src/main/scala/org/clulab/struct/DirectedGraphIndex.scala b/main/src/main/scala/org/clulab/struct/DirectedGraphIndex.scala index 07cf125d8..fbed8630e 100644 --- a/main/src/main/scala/org/clulab/struct/DirectedGraphIndex.scala +++ b/main/src/main/scala/org/clulab/struct/DirectedGraphIndex.scala @@ -26,18 +26,18 @@ class DirectedGraphIndex[E]( } def addEdge(head:Int, modifier:Int, label:E): Unit = { - outgoingEdges(head) += Tuple2(modifier, label) - incomingEdges(modifier) += Tuple2(head, label) + outgoingEdges(head) += ((modifier, label)) + incomingEdges(modifier) += ((head, label)) val byLabel = edgesByName.getOrElseUpdate(label, new mutable.HashSet[(Int, Int)]()) - byLabel += Tuple2(head, modifier) + byLabel += ((head, modifier)) } def removeEdge(head:Int, modifier:Int, label:E): Unit = { - outgoingEdges(head).remove(Tuple2(modifier, label)) - incomingEdges(modifier).remove(Tuple2(head, label)) + outgoingEdges(head).remove((modifier, label)) + incomingEdges(modifier).remove((head, label)) val byLabel = edgesByName.get(label) if(byLabel.nonEmpty) { - byLabel.get.remove(Tuple2(head, modifier)) + byLabel.get.remove((head, modifier)) } } diff --git a/main/src/main/scala/org/clulab/struct/Lexicon.scala b/main/src/main/scala/org/clulab/struct/Lexicon.scala index 768060818..212918233 100644 --- a/main/src/main/scala/org/clulab/struct/Lexicon.scala +++ b/main/src/main/scala/org/clulab/struct/Lexicon.scala @@ -1,12 +1,12 @@ package org.clulab.struct -import java.io._ - import org.clulab.struct.Lexicon.logger import org.clulab.utils.Files import org.slf4j.LoggerFactory +import java.io._ import scala.Serializable +import scala.util.Using /** * Generic lexicon: maps objects of type T to Ints, both ways @@ -87,9 +87,9 @@ class Lexicon[T] extends Serializable { } def saveTo(fileName:String): Unit = { - val w = new BufferedWriter(new FileWriter(fileName)) - saveTo(w) - w.close() + Using.resource(new BufferedWriter(new FileWriter(fileName))) 
{ w => + saveTo(w) + } } def saveTo(w:Writer): Unit = { @@ -151,10 +151,10 @@ object Lexicon { /** Loads a lexicon saved by Lexicon.saveTo */ def loadFrom[F](fileName:String):Lexicon[F] = { - val is = new BufferedReader(new FileReader(fileName)) - val lex = loadFrom[F](is) - is.close() - lex + Using.resource(new BufferedReader(new FileReader(fileName))) { is => + val lex = loadFrom[F](is) + lex + } } def loadFrom[F](r:Reader):Lexicon[F] = { diff --git a/main/src/main/scala/org/clulab/utils/Closer.scala b/main/src/main/scala/org/clulab/utils/Closer.scala deleted file mode 100644 index e7c516860..000000000 --- a/main/src/main/scala/org/clulab/utils/Closer.scala +++ /dev/null @@ -1,86 +0,0 @@ -package org.clulab.utils - -import scala.io.Source -import scala.language.implicitConversions -import scala.util.control.NonFatal - -object Closer { - - trait Releasable[Resource] { - def release(resource: Resource): Unit - } - - object Releasable { - - implicit def releasableAutoCloseable[Resource <: AutoCloseable]: Releasable[Resource] = { - new Releasable[Resource] { - def release(resource: Resource): Unit = Option(resource).foreach(_.close()) - } - } - - // In Scala 2.11, Source does not inherit from Closeable, so one has to tell Closer how to close() it. - implicit def releasableSource[Resource <: Source]: Releasable[Resource] = { - new Releasable[Resource] { - def release(resource: Resource): Unit = Option(resource).foreach(_.close()) - } - } - } - - def close[Resource: Releasable](resource: => Resource): Unit = - implicitly[Releasable[Resource]].release(resource) - - // This is so that exceptions caused during close are caught, but don't - // prevent the registration of any previous exception. - // See also https://medium.com/@dkomanov/scala-try-with-resources-735baad0fd7d. - // Others have resource: => Closeable, but I want the resource evaluated beforehand - // so that it doesn't throw an exception before there is anything to close. 
- // 3 here is for the number of arguments. Operator overloading doesn't handle it. - protected def autoClose3[Resource, Result](resource: Resource)(closer: () => Unit)(function: Resource => Result): Result = { - - val (result: Option[Result], exception: Option[Throwable]) = try { - (Some(function(resource)), None) - } - catch { - case exception: Throwable => (None, Some(exception)) - } - - val closeException: Option[Throwable] = Option(resource).flatMap { _ => - try { - closer() - None - } - catch { - case exception: Throwable => Some(exception) - } - } - - (exception, closeException) match { - case (None, None) => result.get - case (Some(ex), None) => throw ex - case (None, Some(ex)) => throw ex - case (Some(ex), Some(closeEx)) => (ex, closeEx) match { - case (e, NonFatal(nonfatal)) => - // Put the potentially fatal one first. - e.addSuppressed(nonfatal) - throw e - case (NonFatal(nonfatal), e) => - // Put the potentially fatal one first. - e.addSuppressed(nonfatal) - throw e - case (e, closeE) => - // On tie, put exception before closeException. 
- e.addSuppressed(closeE) - throw e - } - } - } - - def autoClose[Resource: Releasable, Result](resource: Resource)(function: Resource => Result): Result = - autoClose3(resource)(() => implicitly[Releasable[Resource]].release(resource))(function) - - implicit class AutoCloser[Resource: Releasable](resource: Resource) { - - def autoClose[Result](function: Resource => Result): Result = - Closer.autoClose(resource)(function) - } -} diff --git a/main/src/main/scala/org/clulab/utils/CoNLLtoSentencePerLine.scala b/main/src/main/scala/org/clulab/utils/CoNLLtoSentencePerLine.scala index e456d85c4..af6a31fa1 100644 --- a/main/src/main/scala/org/clulab/utils/CoNLLtoSentencePerLine.scala +++ b/main/src/main/scala/org/clulab/utils/CoNLLtoSentencePerLine.scala @@ -1,9 +1,9 @@ package org.clulab.utils import java.io.PrintWriter - import scala.collection.mutable.ArrayBuffer import scala.io.Source +import scala.util.Using /** * Converts the CoNLL format into the one-sentence-per-line required by our LMs @@ -14,29 +14,29 @@ import scala.io.Source object CoNLLtoSentencePerLine { def main(args: Array[String]): Unit = { assert(args.length == 2) - val source = Source.fromFile(args(0)) - val dest = new PrintWriter(args(1)) - - var words = new ArrayBuffer[String]() - var sentCount = 0 - for(line <- source.getLines()) { - val tokens = line.split("\\s+") - if(tokens.nonEmpty) { - words += tokens(0) // the first token must be the current word; we ignore all others - } else { - // reach end of a sentence + Using.resources( + Source.fromFile(args(0)), + new PrintWriter(args(1)) + ) { (source, dest) => + var words = new ArrayBuffer[String]() + var sentCount = 0 + for (line <- source.getLines()) { + val tokens = line.split("\\s+") + if (tokens.nonEmpty) { + words += tokens(0) // the first token must be the current word; we ignore all others + } else { + // reach end of a sentence + dest.println(words.mkString(" ")) + words = new ArrayBuffer[String]() + sentCount += 1 + } + } + if 
(words.nonEmpty) { dest.println(words.mkString(" ")) - words = new ArrayBuffer[String]() sentCount += 1 } - } - if(words.nonEmpty) { - dest.println(words.mkString(" ")) - sentCount += 1 - } - println(s"Converted $sentCount sentences.") - source.close() - dest.close() + println(s"Converted $sentCount sentences.") + } } } diff --git a/main/src/main/scala/org/clulab/utils/DependencyUtils.scala b/main/src/main/scala/org/clulab/utils/DependencyUtils.scala index 021f58923..ef2310572 100644 --- a/main/src/main/scala/org/clulab/utils/DependencyUtils.scala +++ b/main/src/main/scala/org/clulab/utils/DependencyUtils.scala @@ -73,16 +73,18 @@ object DependencyUtils { } /** - * Finds the highest node (i.e. closest to a root) in an Interval of a directed graph. If there are multiple nodes of - * the same rank, all are returned. + * Finds the minimum distance to a root from the token position. + * In some edge cases, Double.MaxValue and Double.PositiveInfinity can be returned. * - * @param span an Interval of nodes + * @param token an Interval of nodes * @param graph a directed graph containing the nodes in span - * @return the single node which is closest to the root among those in span + * @return the minimum distance to the root among those in span */ - def findHeads(span: Interval, graph: DependencyGraph): Seq[Int] = { + def distToRoot(token: Int, graph: DependencyGraph): Double = { + // println(s"distToRoot for token: $token:") + @annotation.tailrec - def countSteps(toksWithDist: List[(Int, Double)], seen: Set[Int]): Double = { + def loop(toksWithDist: List[(Int, Double)], seen: Set[Int]): Double = { // println("\tcountSteps: " + toksWithDist.mkString(", ")) toksWithDist match { @@ -94,7 +96,7 @@ object DependencyUtils { Double.MaxValue // this means the distance to the head is infinite, i.e., the head is not reachable case (tok, dist) :: rest if seen contains tok => // we already explored this token, skip - countSteps(rest, seen) + loop(rest, seen) case (tok, dist) :: rest 
if graph.roots contains tok => // found a root // it is the closest one because we are searching breath-first @@ -111,21 +113,48 @@ } else { // keep looking, breadth-first val nextStep = incoming.map(i => (i, dist + 1)).toList - countSteps(rest ::: nextStep, seen + tok) + loop(rest ::: nextStep, seen + tok) } } } - // returns the distance to the closest root for a given token - def distToRoot(token: Int): Double = { - // println(s"distToRoot for token: $token:") - countSteps(List((token, 0)), Set.empty) + loop(List((token, 0)), Set.empty) + } + + /** + * Finds the optional distance to a root for the highest node in an Interval of a directed graph. + * If span is empty or no root is reachable, None is returned. + * + * @param span an Interval of nodes + * @param graph a directed graph containing the nodes in span + * @return some minimum distance to the root among those in span, or None if the span is empty or no root is reachable + */ + def distToRootOpt(span: Interval, graph: DependencyGraph): Option[Int] = { + if (span.isEmpty) None + else { + val distances = span.map { tokenIndex => + DependencyUtils.distToRoot(tokenIndex, graph) + } + val minDistance = distances.min + + if (minDistance == Double.MaxValue || minDistance == Double.PositiveInfinity) None + else Some(minDistance.toInt) } + } + /** + * Finds the highest node (i.e. closest to a root) in an Interval of a directed graph. If there are multiple nodes of + * the same rank, all are returned.
+ * + * @param span an Interval of nodes + * @param graph a directed graph containing the nodes in span + * @return the single node which is closest to the root among those in span + */ + def findHeads(span: Interval, graph: DependencyGraph): Seq[Int] = { if (span.isEmpty) Nil else { // get the distance to root for each token in span - val toksWithDist = span.map(t => (t, distToRoot(t))) + val toksWithDist = span.map(t => (t, distToRoot(t, graph))) val dists = toksWithDist.map(_._2) // return all tokens with minimum distance val minDist = dists.min diff --git a/main/src/main/scala/org/clulab/utils/FileUtils.scala b/main/src/main/scala/org/clulab/utils/FileUtils.scala index d38239ed6..2a2f5378d 100644 --- a/main/src/main/scala/org/clulab/utils/FileUtils.scala +++ b/main/src/main/scala/org/clulab/utils/FileUtils.scala @@ -1,17 +1,15 @@ package org.clulab.utils +import org.clulab.scala.WrappedArray._ + import java.io._ import java.net.URL -import java.nio.file.StandardCopyOption import java.nio.file.{Files => JFiles, Path, Paths} +import java.nio.file.StandardCopyOption import java.util.zip.ZipFile - -import org.clulab.scala.WrappedArray._ -import org.clulab.utils.Closer.AutoCloser - - -import scala.jdk.CollectionConverters._ import scala.io.Source +import scala.jdk.CollectionConverters._ +import scala.util.Using object FileUtils { def appendingPrintWriterFromFile(file: File): PrintWriter = Sinker.printWriterFromFile(file, append = true) @@ -52,13 +50,13 @@ object FileUtils { // Add FromFile as necessary. See getText below. def getCommentedTextSetFromResource(path: String): Set[String] = - Sourcer.sourceFromResource(path).autoClose { source => + Using.resource(Sourcer.sourceFromResource(path)) { source => getCommentedLinesFromSource(source).map(_.trim).toSet } // Add FromResource as necessary. 
See getText below, def getCommentedTextFromFile(file: File, sep: String = " "): String = - Sourcer.sourceFromFile(file).autoClose { source => + Using.resource(Sourcer.sourceFromFile(file)) { source => // These haven't been trimmed in case esp. trailing spaces are important. getCommentedLinesFromSource(source).mkString(sep) } @@ -66,39 +64,40 @@ object FileUtils { protected def getTextFromSource(source: Source): String = source.mkString def getTextFromResource(path: String): String = - Sourcer.sourceFromResource(path).autoClose { source => + Using.resource(Sourcer.sourceFromResource(path)) { source => getTextFromSource(source) } def getTextFromFile(file: File): String = - Sourcer.sourceFromFile(file).autoClose { source => + Using.resource(Sourcer.sourceFromFile(file)) { source => getTextFromSource(source) } def getTextFromFile(path: String): String = - Sourcer.sourceFromFile(new File(path)).autoClose { source => + Using.resource(Sourcer.sourceFromFile(new File(path))) { source => getTextFromSource(source) } def copyResourceToFile(src: String, dest: File): Unit = { - FileUtils.getClass.getResourceAsStream(src).autoClose { (is: InputStream) => - new FileOutputStream(dest).autoClose { (os: FileOutputStream) => - val buf = new Array[Byte](8192) - - def transfer: Boolean = { - val len = is.read(buf) - val continue = - if (len > 0) { - os.write(buf, 0, len); - true - } - else false - - continue - } - - while (transfer) {} + Using.resources( + FileUtils.getClass.getResourceAsStream(src), + new FileOutputStream(dest) + ) { (is: InputStream, os: FileOutputStream) => + val buf = new Array[Byte](8192) + + def transfer: Boolean = { + val len = is.read(buf) + val continue = + if (len > 0) { + os.write(buf, 0, len); + true + } + else false + + continue } + + while (transfer) {} } } @@ -109,14 +108,14 @@ object FileUtils { } def load[A](filename: String, classProvider: Any): A = - newClassLoaderObjectInputStream(filename, classProvider).autoClose { objectInputStream => + 
Using.resource(newClassLoaderObjectInputStream(filename, classProvider)) { objectInputStream => objectInputStream.readObject().asInstanceOf[A] } def load[A](bytes: Array[Byte], classProvider: Any): A = { val classLoader = classProvider.getClass.getClassLoader - new ClassLoaderObjectInputStream(classLoader, new ByteArrayInputStream(bytes)).autoClose { objectInputStream => + Using.resource(new ClassLoaderObjectInputStream(classLoader, new ByteArrayInputStream(bytes))) { objectInputStream => objectInputStream.readObject().asInstanceOf[A] } } @@ -184,7 +183,7 @@ object FileUtils { new ObjectInputStream(newBufferedInputStream(filename)) def unzip(zipPath: Path, outputPath: Path, replace: Boolean = false): Unit = { - new ZipFile(zipPath.toFile).autoClose { zipFile => + Using.resource(new ZipFile(zipPath.toFile)) { zipFile => for (entry <- zipFile.entries.asScala) { val path = outputPath.resolve(entry.getName) if (entry.isDirectory) { diff --git a/main/src/main/scala/org/clulab/utils/Files.scala b/main/src/main/scala/org/clulab/utils/Files.scala index 4909e8a1f..a661feff7 100644 --- a/main/src/main/scala/org/clulab/utils/Files.scala +++ b/main/src/main/scala/org/clulab/utils/Files.scala @@ -3,8 +3,8 @@ package org.clulab.utils import java.io._ import java.nio.charset.Charset import java.util.zip.GZIPInputStream - import scala.collection.mutable.ListBuffer +import scala.util.Using /** * File utilities @@ -98,20 +98,21 @@ object Files { deleteOnExit:Boolean = true, bufSize:Int = 131072): Unit = { val jar = new java.util.jar.JarFile(jarFileName) val entry = jar.getEntry(entryName) - val is = jar.getInputStream(entry) - val fos = new FileOutputStream(outFileName) - val buffer = new Array[Byte](bufSize) - var done = false - while(! 
done) { - val num = is.read(buffer, 0, bufSize) - if(num > 0) { - fos.write(buffer, 0, num) - } else { - done = true + Using.resources( + jar.getInputStream(entry), + new FileOutputStream(outFileName) + ) { (is, fos) => + val buffer = new Array[Byte](bufSize) + var done = false + while (!done) { + val num = is.read(buffer, 0, bufSize) + if (num > 0) { + fos.write(buffer, 0, num) + } else { + done = true + } } } - fos.close() - is.close() if(deleteOnExit) new File(outFileName).deleteOnExit() diff --git a/main/src/main/scala/org/clulab/utils/Hash.scala b/main/src/main/scala/org/clulab/utils/Hash.scala new file mode 100644 index 000000000..84cc63983 --- /dev/null +++ b/main/src/main/scala/org/clulab/utils/Hash.scala @@ -0,0 +1,50 @@ +package org.clulab.utils + +import scala.util.hashing.MurmurHash3 + +object Hash { + val symmetricSeed = 0xb592f7ae + + def apply(string: String): Int = stringHash(string) + + def apply(seed: Int, data: Int*): Int = { + finalizeHash(data.foldLeft(seed)(mix), data.length) + } + + // TODO: This count should probably not be used. The caller is probably messed up. 
+ def withLast(count: Int)(seed: Int, data: Int*): Int = withLastCount(count)(seed, data) + + def withLast(seed: Int, data: Int*): Int = withLastCount(data.length)(seed, data) + + def withLastCount(count: Int)(seed: Int, data: Seq[Int]): Int = { + val iterator = data.iterator + + def loop(value: Int, remaining: Int): Int = { + val result = remaining match { + case 0 => finalizeHash(value, count) + case 1 => loop(mixLast(value, iterator.next()), 0) + case _ => loop(mix(value, iterator.next()), remaining - 1) + } + + result + } + + loop(seed, data.length) + } + + def ordered(xs: TraversableOnce[Any]): Int = orderedHash(xs) + + def unordered(xs: TraversableOnce[Any]): Int = unorderedHash(xs) + + def stringHash(x: String): Int = MurmurHash3.stringHash(x) + + def orderedHash(xs: TraversableOnce[Any]): Int = MurmurHash3.orderedHash(xs) + + def unorderedHash(xs: TraversableOnce[Any]): Int = MurmurHash3.unorderedHash(xs) + + def finalizeHash(hash: Int, length: Int): Int = MurmurHash3.finalizeHash(hash, length) + + def mix(hash: Int, data: Int): Int = MurmurHash3.mix(hash, data) + + def mixLast(hash: Int, data: Int): Int = MurmurHash3.mixLast(hash, data) +} diff --git a/main/src/main/scala/org/clulab/utils/NullWriter.scala b/main/src/main/scala/org/clulab/utils/NullWriter.scala new file mode 100644 index 000000000..9d5638804 --- /dev/null +++ b/main/src/main/scala/org/clulab/utils/NullWriter.scala @@ -0,0 +1,10 @@ +package org.clulab.utils + +import java.io.Writer + +// Java 11 has things built in: Writer.nullWriter() +class NullWriter extends Writer { + override def write(cbuf: Array[Char], off: Int, len: Int): Unit = () + override def flush(): Unit = () + override def close(): Unit = () +} diff --git a/main/src/main/scala/org/clulab/utils/ProcessCoNLL03.scala b/main/src/main/scala/org/clulab/utils/ProcessCoNLL03.scala index 5ecab85ea..a61f9c81c 100644 --- a/main/src/main/scala/org/clulab/utils/ProcessCoNLL03.scala +++ 
b/main/src/main/scala/org/clulab/utils/ProcessCoNLL03.scala @@ -4,6 +4,7 @@ import org.clulab.processors.clu.BalaurProcessor import org.clulab.sequences.{ColumnReader, Row} import java.io.PrintWriter +import scala.util.Using /** * Little utility that regenerates the POS tags and chunk labels for the CoNLL-03 dataset @@ -14,18 +15,20 @@ object ProcessCoNLL03 extends App { val proc = new BalaurProcessor() val rows = ColumnReader.readColumns(args(0)) println(s"Found ${rows.length} sentences.") - val pw = new PrintWriter(args(0) + ".reparsed") - for (row <- rows) { - val words = row.map(e => e.get(0)) - if (row.length == 1 && words(0) == "-DOCSTART-") { - saveSent(pw, row) - } else { - val doc = proc.mkDocumentFromTokens(Seq(words)) - proc.annotate(doc) - saveSent(pw, row, doc.sentences(0).tags, doc.sentences(0).chunks) + + Using.resource(new PrintWriter(args(0) + ".reparsed")) { printWriter => + for (row <- rows) { + val words = row.map(e => e.get(0)) + if (row.length == 1 && words(0) == "-DOCSTART-") { + saveSent(printWriter, row) + } + else { + val doc = proc.mkDocumentFromTokens(Seq(words)) + proc.annotate(doc) + saveSent(printWriter, row, doc.sentences(0).tags, doc.sentences(0).chunks) + } } } - pw.close() def saveSent(pw: PrintWriter, sent: Array[Row], tags: Option[Array[String]] = None, chunks: Option[Array[String]] = None): Unit = { if (tags.isDefined) { diff --git a/main/src/main/scala/org/clulab/utils/ProgressBar.scala b/main/src/main/scala/org/clulab/utils/ProgressBar.scala index dfa0b209f..493f47bd7 100644 --- a/main/src/main/scala/org/clulab/utils/ProgressBar.scala +++ b/main/src/main/scala/org/clulab/utils/ProgressBar.scala @@ -2,7 +2,7 @@ package org.clulab.utils import me.tongfei.progressbar.{ProgressBar => JProgressBar} -class ProgressBar[T](text: String, outerIterator: Iterator[T]) extends Iterable[T] { +class ProgressBar[T](text: String, outerIterator: Iterator[T]) extends Iterable[T] with AutoCloseable { val (jProgressBar, innerIterator) = { val 
(leftIterator, rightIterator) = outerIterator.duplicate val jProgressBar = new JProgressBar(text, leftIterator.length) @@ -14,6 +14,8 @@ class ProgressBar[T](text: String, outerIterator: Iterator[T]) extends Iterable[ // This convenience method unfortunately limits the progress bar to one traversal. def setExtraMessage(message: String): Unit = jProgressBar.setExtraMessage(message) + + override def close(): Unit = jProgressBar.close() } object ProgressBar { @@ -28,11 +30,7 @@ object ProgressBar { class ProgressBarIterator[T](jProgressBar: JProgressBar, iterator: Iterator[T]) extends Iterator[T] { override def hasNext: Boolean = { - val result = iterator.hasNext - - if (!result) - jProgressBar.close() - result + iterator.hasNext } override def next(): T = { diff --git a/main/src/main/scala/org/clulab/utils/ScienceUtils.scala b/main/src/main/scala/org/clulab/utils/ScienceUtils.scala index 33f1683ad..14e5ee534 100644 --- a/main/src/main/scala/org/clulab/utils/ScienceUtils.scala +++ b/main/src/main/scala/org/clulab/utils/ScienceUtils.scala @@ -1,13 +1,13 @@ package org.clulab.utils +import org.clulab.utils.ScienceUtils._ + import java.io.{BufferedReader, InputStreamReader} import java.nio.charset.StandardCharsets -import java.util.regex.Pattern import java.text.Normalizer -import org.clulab.utils.Closer.AutoCloser -import org.clulab.utils.ScienceUtils._ - +import java.util.regex.Pattern import scala.collection.mutable +import scala.util.Using class ScienceUtils { val unicodes:Map[Char, String] = loadUnicodes @@ -144,7 +144,7 @@ object ScienceUtils { private def loadAccents:Set[Char] = { val acf = getClass.getClassLoader.getResourceAsStream(ACCENTED_CHARACTERS) assert(acf != null, s"Failed to find resource file $ACCENTED_CHARACTERS in the classpath!") - new BufferedReader(new InputStreamReader(acf, charset)).autoClose { reader => + Using.resource(new BufferedReader(new InputStreamReader(acf, charset))) { reader => val accents = new mutable.ArrayBuffer[Char]() var done = 
false while(! done) { @@ -163,7 +163,7 @@ object ScienceUtils { val map = new mutable.HashMap[Char, String]() val is = getClass.getClassLoader.getResourceAsStream(UNICODE_TO_ASCII) assert(is != null, s"Failed to find resource file $UNICODE_TO_ASCII in the classpath!") - new BufferedReader(new InputStreamReader(is, charset)).autoClose { reader => + Using.resource(new BufferedReader(new InputStreamReader(is, charset))) { reader => var done = false while (!done) { var line = normalizeUnicode(reader.readLine()) diff --git a/main/src/main/scala/org/clulab/utils/Serializer.scala b/main/src/main/scala/org/clulab/utils/Serializer.scala index 09f168783..985ddc687 100644 --- a/main/src/main/scala/org/clulab/utils/Serializer.scala +++ b/main/src/main/scala/org/clulab/utils/Serializer.scala @@ -1,40 +1,35 @@ package org.clulab.utils -import org.clulab.utils.Closer.Releasable - import scala.language.implicitConversions +import scala.util.Using import java.io._ object Serializer { - def using[Resource: Releasable, Result](resource: Resource)(function: Resource => Result): Result = { - Closer.autoClose(resource)(function) - } - /** serialize object to output stream */ def save[A](obj: A, outputStream: OutputStream): Unit = { - using(new ObjectOutputStream(outputStream)) { oos => + Using.resource(new ObjectOutputStream(outputStream)) { oos => oos.writeObject(obj) } } /** serialize object to file */ def save[A](obj: A, file: File): Unit = { - using(new BufferedOutputStream(new FileOutputStream(file))) { fos => + Using.resource(new BufferedOutputStream(new FileOutputStream(file))) { fos => save(obj, fos) } } /** serialize object to file */ def save[A](obj: A, filename: String): Unit = { - using(new BufferedOutputStream(new FileOutputStream(filename))) { fos => + Using.resource(new BufferedOutputStream(new FileOutputStream(filename))) { fos => save(obj, fos) } } /** serialize object to byte array */ def save[A](obj: A): Array[Byte] = { - using(new ByteArrayOutputStream()) { baos => + 
Using.resource(new ByteArrayOutputStream()) { baos => save(obj, baos) baos.toByteArray } @@ -47,7 +42,7 @@ object Serializer { /* deserialize from input stream */ def load[A](inputStream: InputStream, classLoader: ClassLoader): A = { - using(new ClassLoaderObjectInputStream(classLoader, inputStream)) { ois => + Using.resource(new ClassLoaderObjectInputStream(classLoader, inputStream)) { ois => ois.readObject().asInstanceOf[A] } } @@ -59,7 +54,7 @@ object Serializer { /* deserialize from file */ def load[A](file: File, classLoader: ClassLoader): A = { - using(new BufferedInputStream(new FileInputStream(file))) { fis => + Using.resource(new BufferedInputStream(new FileInputStream(file))) { fis => load[A](fis, classLoader) } } @@ -71,7 +66,7 @@ object Serializer { /* deserialize from file */ def load[A](filename: String, classLoader: ClassLoader): A = { - using(new BufferedInputStream(new FileInputStream(filename))) { fis => + Using.resource(new BufferedInputStream(new FileInputStream(filename))) { fis => load[A](fis, classLoader) } } @@ -83,7 +78,7 @@ object Serializer { /* deserialize from byte array */ def load[A](bytes: Array[Byte], classLoader: ClassLoader): A = { - using(new ByteArrayInputStream(bytes)) { bais => + Using.resource(new ByteArrayInputStream(bytes)) { bais => load[A](bais, classLoader) } } diff --git a/main/src/main/scala/org/clulab/utils/StringUtils.scala b/main/src/main/scala/org/clulab/utils/StringUtils.scala index d9a51aa90..7d713f8a0 100644 --- a/main/src/main/scala/org/clulab/utils/StringUtils.scala +++ b/main/src/main/scala/org/clulab/utils/StringUtils.scala @@ -3,9 +3,9 @@ package org.clulab.utils import java.io.{ FileInputStream, BufferedInputStream, PrintWriter, StringWriter } import java.util.Properties import java.util.regex.Pattern - -import scala.jdk.CollectionConverters._ import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters._ +import scala.util.Using /** * Converts a command line to properties; and other 
useful String utils @@ -36,15 +36,15 @@ object StringUtils { if ((key == PROPERTIES || key == PROPS) && ! value.isEmpty) { // a props file was specified. read props from there println(s"loading props from file ${value.get}") - val is = new BufferedInputStream(new FileInputStream(value.get)) val propsFromFile = new Properties() - propsFromFile.load(is) + Using.resource(new BufferedInputStream(new FileInputStream(value.get))) { is => + propsFromFile.load(is) + } // trim all values, they may have trailing spaces for (k <- propsFromFile.keySet().asScala) { val v = propsFromFile.getProperty(k.asInstanceOf[String]).trim result.setProperty(k.asInstanceOf[String], v) } - is.close() } else { result.setProperty(key, value.getOrElse("true")) } @@ -141,9 +141,9 @@ object StringUtils { /** Format the given exception as a string and return the string. */ def exceptionToString (ex: Exception): String = { - val sw = new StringWriter - ex.printStackTrace(new PrintWriter(sw)) - sw.toString + StringUtils.viaPrintWriter { printWriter => + ex.printStackTrace(printWriter) + } } /** Generates the stem of a word, according to the Porter algorithm */ @@ -182,4 +182,13 @@ object StringUtils { after(string, string.indexOf(char), all, keep) def med(source: String, target: String): Int = MED(source, target).getDistance + + def viaPrintWriter(f: (PrintWriter) => Unit): String = { + val stringWriter = new StringWriter + + Using.resource(new PrintWriter(stringWriter)) { printWriter => + f(printWriter) + } + stringWriter.toString + } } diff --git a/main/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala b/main/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala index d90540932..0d7593f8d 100644 --- a/main/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala +++ b/main/src/main/scala/org/clulab/utils/ToEnhancedDependencies.scala @@ -21,6 +21,8 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer} * Date: 8/1/17 */ object ToEnhancedDependencies { + type EdgeSpec = 
(Int, Int, String) + def generateStanfordEnhancedDependencies(sentence:Sentence, dg:DirectedGraph[String]): DirectedGraph[String] = { val dgi = dg.toDirectedGraphIndex() collapsePrepositionsStanford(sentence, dgi) @@ -49,7 +51,7 @@ object ToEnhancedDependencies { * Replicates nmod_* accross conj dependencies * economic decline has led to violence and displacement => nmod_to from "led" to both "violence" and "displacement" */ - def replicateCollapsedNmods(collapsedNmods: Seq[(Int, Int, String)], + def replicateCollapsedNmods(collapsedNmods: Seq[EdgeSpec], dgi: DirectedGraphIndex[String]): Unit = { for(nmod <- collapsedNmods) { val conjs = dgi.findByHeadAndName(nmod._2, "conj") @@ -138,9 +140,9 @@ object ToEnhancedDependencies { def collapsePrepositionsUniversal( sentence:Sentence, - dgi:DirectedGraphIndex[String]): Seq[(Int, Int, String)] = { + dgi:DirectedGraphIndex[String]): Seq[EdgeSpec] = { - val collapsedNmods = new ArrayBuffer[(Int, Int, String)]() + val collapsedNmods = new ArrayBuffer[EdgeSpec]() collapsePrepositionsUniversalNmodCase(sentence, dgi, collapsedNmods) collapsePrepositionsUniversalDueTo(sentence, dgi, collapsedNmods) collapsedNmods @@ -155,7 +157,7 @@ object ToEnhancedDependencies { def collapsePrepositionsUniversalNmodCase( sentence:Sentence, dgi:DirectedGraphIndex[String], - collapsedNmods: ArrayBuffer[(Int, Int, String)]): Unit = { + collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { val toRemove = new ListBuffer[Edge[String]] var shouldRemove = false @@ -169,7 +171,8 @@ object ToEnhancedDependencies { // TODO: add nmod:agent (if word == "by") and passive voice here? 
dgi.addEdge(prep.source, prep.destination, s"nmod_$mwe") - collapsedNmods += Tuple3(prep.source, prep.destination, s"nmod_$mwe") + val edgeSpec = (prep.source, prep.destination, s"nmod_$mwe") + collapsedNmods += edgeSpec shouldRemove = true } } @@ -186,7 +189,7 @@ object ToEnhancedDependencies { def collapsePrepositionsUniversalDueTo( sentence:Sentence, dgi:DirectedGraphIndex[String], - collapsedNmods: ArrayBuffer[(Int, Int, String)]): Unit = { + collapsedNmods: ArrayBuffer[EdgeSpec]): Unit = { val toRemove = new ListBuffer[Edge[String]] var shouldRemove = false @@ -205,7 +208,8 @@ object ToEnhancedDependencies { // found the dep from "due" to "drought" val destination = rightDep.destination dgi.addEdge(source, destination, label) - collapsedNmods += Tuple3(source, destination, label) + val edgeSpec = (source, destination, label) + collapsedNmods += edgeSpec shouldRemove = true } } diff --git a/main/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala b/main/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala new file mode 100644 index 000000000..984fc8be0 --- /dev/null +++ b/main/src/test/scala-2.11_2.12/org/clulab/utils/TestHash.scala @@ -0,0 +1,99 @@ +package org.clulab.utils + +import org.clulab.odin.serialization.json._ +import org.clulab.odin.{CrossSentenceMention, EventMention, RelationMention, TextBoundMention, _} +import org.clulab.processors.clu.BalaurProcessor +import org.clulab.sequences.LexiconNER +import org.clulab.struct.{DirectedGraph, Edge} + +class TestHash extends Test { + val customLexiconNer = { + val kbsAndCaseInsensitiveMatchings: Seq[(String, Boolean)] = Seq( + ("org/clulab/odinstarter/FOOD.tsv", true) + ) + val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) + val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) + + LexiconNER(kbs, caseInsensitiveMatchings, None) + } + val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val extractorEngine = { + val rules = 
FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") + + ExtractorEngine(rules) + } + val document = processor.annotate("John eats cake.") + val mentions = extractorEngine.extractFrom(document).sortBy(_.arguments.size) + val sortedMentions = mentions.sortBy { mention => (mention.startOffset, mention.endOffset) } + val eventMention = sortedMentions.find(_.isInstanceOf[EventMention]).get.asInstanceOf[EventMention] + val otherMentions = sortedMentions.filterNot(_.eq(eventMention)) + val relationMention = eventMention.toRelationMention + val crossSentenceMention = newCrossSentenceMention(eventMention, otherMentions.head, otherMentions.last) + val allMentions = sortedMentions :+ relationMention :+ crossSentenceMention + + behavior of "Hash" + + it should "compute the expected equivalence hash for a Document" in { + val expectedHash = -1960515414 + val actualHash = document.equivalenceHash + + actualHash should be (expectedHash) + } + + def getEquivalenceHash(mention: Mention): Int = mention match { + case mention: TextBoundMention => mention.equivalenceHash + case mention: EventMention => mention.equivalenceHash + case mention: RelationMention => mention.equivalenceHash + case mention: CrossSentenceMention => mention.equivalenceHash + } + + def newCrossSentenceMention(mention: EventMention, anchor: Mention, neighbor: Mention): CrossSentenceMention = { + new CrossSentenceMention( + mention.labels, + anchor, + neighbor, + mention.arguments, + mention.document, + mention.keep, + mention.foundBy, + mention.attachments + ) + } + + it should "compute the expected equivalence hashes for Mentions" in { + val expectedHashes = Array(-1163474360, 1678747586, 308621545, 1846645205, -1357918569) + val actualHashes = allMentions.map(getEquivalenceHash) + + actualHashes should be (expectedHashes) + } + + it should "compute the expected hashCode for Mentions" in { + val expectedHashes = Array(-681771612, -254169462, -1589508928, 823771056, 1600327181) + val actualHashes = 
allMentions.map(_.hashCode) + + actualHashes should be(expectedHashes) + } + + it should "compute the expected hashCode for a String" in { + val expectedHash = 1077910243 + val actualHash = "supercalifragilisticexpialidocious".hashCode + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for a String" in { + val expectedHash = 887441175 + val actualHash = Hash("supercalifragilisticexpialidocious") + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for a DirectedGraph" in { + val expectedHash = 1945759943 + val edge = Edge(0, 1, "relation") + val directedGraph = DirectedGraph(List(edge)) + val actualHash = directedGraph.equivalenceHash + + actualHash should be (expectedHash) + } +} diff --git a/main/src/test/scala-2.13/org/clulab/utils/TestHash.scala b/main/src/test/scala-2.13/org/clulab/utils/TestHash.scala new file mode 100644 index 000000000..fa94866f0 --- /dev/null +++ b/main/src/test/scala-2.13/org/clulab/utils/TestHash.scala @@ -0,0 +1,99 @@ +package org.clulab.utils + +import org.clulab.odin.serialization.json._ +import org.clulab.odin.{CrossSentenceMention, EventMention, RelationMention, TextBoundMention, _} +import org.clulab.processors.clu.BalaurProcessor +import org.clulab.sequences.LexiconNER +import org.clulab.struct.{DirectedGraph, Edge} + +class TestHash extends Test { + val customLexiconNer = { + val kbsAndCaseInsensitiveMatchings: Seq[(String, Boolean)] = Seq( + ("org/clulab/odinstarter/FOOD.tsv", true) + ) + val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) + val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) + + LexiconNER(kbs, caseInsensitiveMatchings, None) + } + val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val extractorEngine = { + val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") + + ExtractorEngine(rules) + } + val document = processor.annotate("John eats cake.") + val 
mentions = extractorEngine.extractFrom(document).sortBy(_.arguments.size) + val sortedMentions = mentions.sortBy { mention => (mention.startOffset, mention.endOffset) } + val eventMention = sortedMentions.find(_.isInstanceOf[EventMention]).get.asInstanceOf[EventMention] + val otherMentions = sortedMentions.filterNot(_.eq(eventMention)) + val relationMention = eventMention.toRelationMention + val crossSentenceMention = newCrossSentenceMention(eventMention, otherMentions.head, otherMentions.last) + val allMentions = sortedMentions :+ relationMention :+ crossSentenceMention + + behavior of "Hash" + + it should "compute the expected equivalence hash for a Document" in { + val expectedHash = 1145238653 + val actualHash = document.equivalenceHash + + actualHash should be (expectedHash) + } + + def getEquivalenceHash(mention: Mention): Int = mention match { + case mention: TextBoundMention => mention.equivalenceHash + case mention: EventMention => mention.equivalenceHash + case mention: RelationMention => mention.equivalenceHash + case mention: CrossSentenceMention => mention.equivalenceHash + } + + def newCrossSentenceMention(mention: EventMention, anchor: Mention, neighbor: Mention): CrossSentenceMention = { + new CrossSentenceMention( + mention.labels, + anchor, + neighbor, + mention.arguments, + mention.document, + mention.keep, + mention.foundBy, + mention.attachments + ) + } + + it should "compute the expected equivalence hashes for Mentions" in { + val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) + val actualHashes = allMentions.map(getEquivalenceHash) + + actualHashes should be (expectedHashes) + } + + it should "compute the expected hashCode for Mentions" in { + val expectedHashes = Array(1493402696, -1515246319, 205797074, -1416141606, -1294266266) + val actualHashes = allMentions.map(_.hashCode) + + actualHashes should be(expectedHashes) + } + + it should "compute the expected hashCode for a String" in { + val expectedHash = 
1077910243 + val actualHash = "supercalifragilisticexpialidocious".hashCode + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for a String" in { + val expectedHash = 887441175 + val actualHash = Hash("supercalifragilisticexpialidocious") + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for a DirectedGraph" in { + val expectedHash = 821315811 + val edge = Edge(0, 1, "relation") + val directedGraph = DirectedGraph(List(edge)) + val actualHash = directedGraph.equivalenceHash + + actualHash should be (expectedHash) + } +} diff --git a/main/src/test/scala-3/org/clulab/utils/TestHash.scala b/main/src/test/scala-3/org/clulab/utils/TestHash.scala new file mode 100644 index 000000000..fa94866f0 --- /dev/null +++ b/main/src/test/scala-3/org/clulab/utils/TestHash.scala @@ -0,0 +1,99 @@ +package org.clulab.utils + +import org.clulab.odin.serialization.json._ +import org.clulab.odin.{CrossSentenceMention, EventMention, RelationMention, TextBoundMention, _} +import org.clulab.processors.clu.BalaurProcessor +import org.clulab.sequences.LexiconNER +import org.clulab.struct.{DirectedGraph, Edge} + +class TestHash extends Test { + val customLexiconNer = { + val kbsAndCaseInsensitiveMatchings: Seq[(String, Boolean)] = Seq( + ("org/clulab/odinstarter/FOOD.tsv", true) + ) + val kbs = kbsAndCaseInsensitiveMatchings.map(_._1) + val caseInsensitiveMatchings = kbsAndCaseInsensitiveMatchings.map(_._2) + + LexiconNER(kbs, caseInsensitiveMatchings, None) + } + val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) + val extractorEngine = { + val rules = FileUtils.getTextFromResource("/org/clulab/odinstarter/main.yml") + + ExtractorEngine(rules) + } + val document = processor.annotate("John eats cake.") + val mentions = extractorEngine.extractFrom(document).sortBy(_.arguments.size) + val sortedMentions = mentions.sortBy { mention => (mention.startOffset, mention.endOffset) } + val 
eventMention = sortedMentions.find(_.isInstanceOf[EventMention]).get.asInstanceOf[EventMention] + val otherMentions = sortedMentions.filterNot(_.eq(eventMention)) + val relationMention = eventMention.toRelationMention + val crossSentenceMention = newCrossSentenceMention(eventMention, otherMentions.head, otherMentions.last) + val allMentions = sortedMentions :+ relationMention :+ crossSentenceMention + + behavior of "Hash" + + it should "compute the expected equivalence hash for a Document" in { + val expectedHash = 1145238653 + val actualHash = document.equivalenceHash + + actualHash should be (expectedHash) + } + + def getEquivalenceHash(mention: Mention): Int = mention match { + case mention: TextBoundMention => mention.equivalenceHash + case mention: EventMention => mention.equivalenceHash + case mention: RelationMention => mention.equivalenceHash + case mention: CrossSentenceMention => mention.equivalenceHash + } + + def newCrossSentenceMention(mention: EventMention, anchor: Mention, neighbor: Mention): CrossSentenceMention = { + new CrossSentenceMention( + mention.labels, + anchor, + neighbor, + mention.arguments, + mention.document, + mention.keep, + mention.foundBy, + mention.attachments + ) + } + + it should "compute the expected equivalence hashes for Mentions" in { + val expectedHashes = Array(1317064233, 418554464, 269168883, 1021871359, 1657321605) + val actualHashes = allMentions.map(getEquivalenceHash) + + actualHashes should be (expectedHashes) + } + + it should "compute the expected hashCode for Mentions" in { + val expectedHashes = Array(1493402696, -1515246319, 205797074, -1416141606, -1294266266) + val actualHashes = allMentions.map(_.hashCode) + + actualHashes should be(expectedHashes) + } + + it should "compute the expected hashCode for a String" in { + val expectedHash = 1077910243 + val actualHash = "supercalifragilisticexpialidocious".hashCode + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for 
a String" in { + val expectedHash = 887441175 + val actualHash = Hash("supercalifragilisticexpialidocious") + + actualHash should be(expectedHash) + } + + it should "compute the expected equivalence hash for a DirectedGraph" in { + val expectedHash = 821315811 + val edge = Edge(0, 1, "relation") + val directedGraph = DirectedGraph(List(edge)) + val actualHash = directedGraph.equivalenceHash + + actualHash should be (expectedHash) + } +} diff --git a/main/src/test/scala/org/clulab/TestUtils.scala b/main/src/test/scala/org/clulab/TestUtils.scala index de9c9a6e3..41dbce31c 100644 --- a/main/src/test/scala/org/clulab/TestUtils.scala +++ b/main/src/test/scala/org/clulab/TestUtils.scala @@ -1,16 +1,15 @@ package org.clulab -import java.io.File - import org.clulab.learning.RVFDatum -import org.clulab.struct.Counter - -import _root_.scala.io.Source - import org.clulab.processors.Document import org.clulab.serialization.json.JSONSerializer +import org.clulab.struct.Counter import org.json4s.jackson.JsonMethods._ +import _root_.scala.io.Source +import _root_.scala.util.Using +import java.io.File + object TestUtils { def mkRVFDatum[L](label:L, features:List[String]):RVFDatum[L, String] = { @@ -33,11 +32,9 @@ object TestUtils { * @return file contents as a String */ def readFile(path: String) = { - val stream = getClass.getClassLoader.getResourceAsStream(path) - val source = Source.fromInputStream(stream) - val data = source.mkString - source.close() - data + Using.resource(Source.fromInputStream(getClass.getClassLoader.getResourceAsStream(path))) { source => + val data = source.mkString + data + } } - } diff --git a/main/src/test/scala/org/clulab/embeddings/OldCompactWordEmbeddingMap.scala b/main/src/test/scala/org/clulab/embeddings/OldCompactWordEmbeddingMap.scala index f3ad469f5..67d34488b 100644 --- a/main/src/test/scala/org/clulab/embeddings/OldCompactWordEmbeddingMap.scala +++ b/main/src/test/scala/org/clulab/embeddings/OldCompactWordEmbeddingMap.scala @@ -1,14 +1,14 @@ 
package org.clulab.embeddings -import java.io._ import org.clulab.scala.WrappedArray._ -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.{ClassLoaderObjectInputStream, Sourcer} import org.slf4j.{Logger, LoggerFactory} +import java.io._ import java.nio.charset.StandardCharsets import scala.collection.immutable.HashMap import scala.collection.mutable.{HashMap => MutableHashMap, Map => MutableMap} +import scala.util.Using /** * This class and its companion object have been backported from Eidos. There it is/was an optional @@ -64,7 +64,7 @@ class OldCompactWordEmbeddingMap(buildType: OldCompactWordEmbeddingMap.BuildType // Sort the map entries (word -> row) by row and then keep just the word. val words = map.toArray.sortBy(_._2).map(_._1).mkString("\n") - new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename))).autoClose { objectOutputStream => + Using.resource(new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)))) { objectOutputStream => // Writing is performed in two steps so that the parts can be // processed separately when read back in. objectOutputStream.writeObject(words) @@ -224,10 +224,10 @@ object OldCompactWordEmbeddingMap { } protected def loadTxt(filename: String, resource: Boolean): BuildType = { - ( - if (resource) Sourcer.sourceFromResource(filename, StandardCharsets.ISO_8859_1.toString) - else Sourcer.sourceFromFilename(filename, StandardCharsets.ISO_8859_1.toString) - ).autoClose { source => + Using.resource( + if (resource) Sourcer.sourceFromResource(filename, StandardCharsets.ISO_8859_1.toString) + else Sourcer.sourceFromFilename(filename, StandardCharsets.ISO_8859_1.toString) + ) { source => val lines = source.getLines() buildMatrix(lines) @@ -242,7 +242,7 @@ object OldCompactWordEmbeddingMap { // (map, array) // This is "unrolled" for performance purposes. 
- new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(new FileInputStream(filename))).autoClose { objectInputStream => + Using.resource(new ClassLoaderObjectInputStream(this.getClass.getClassLoader, new BufferedInputStream(new FileInputStream(filename)))) { objectInputStream => val map: MapType = new MutableMapType() { diff --git a/main/src/test/scala/org/clulab/embeddings/OldWordEmbeddingMap.scala b/main/src/test/scala/org/clulab/embeddings/OldWordEmbeddingMap.scala index 73c687c04..a00fa3e43 100644 --- a/main/src/test/scala/org/clulab/embeddings/OldWordEmbeddingMap.scala +++ b/main/src/test/scala/org/clulab/embeddings/OldWordEmbeddingMap.scala @@ -1,16 +1,15 @@ package org.clulab.embeddings -import org.clulab.scala.WrappedArray._ - -import java.io._ -import java.nio.{ByteBuffer, ByteOrder} - import org.apache.commons.io.{FileUtils, IOUtils} +import org.clulab.scala.WrappedArray._ import org.clulab.utils.MathUtils import org.slf4j.{Logger, LoggerFactory} +import java.io._ +import java.nio.{ByteBuffer, ByteOrder} import scala.collection.mutable.ArrayBuffer import scala.io.Source +import scala.util.Using /** * Implements similarity metrics using the embedding matrix @@ -45,13 +44,13 @@ class OldWordEmbeddingMap(matrixConstructor: Map[String, Array[Double]]) extends val dimensions: Int = matrix.values.head.length def saveMatrix(mf: String): Unit = { - val pw = new PrintWriter(mf) - pw.println(s"${matrix.size}, $dimensions") - for ((word, vec) <- matrix) { - val strRep = vec.map(v => f"$v%.6f").mkString(" ") - pw.println(s"$word $strRep") + Using.resource(new PrintWriter(mf)) { pw => + pw.println(s"${matrix.size}, $dimensions") + for ((word, vec) <- matrix) { + val strRep = vec.map(v => f"$v%.6f").mkString(" ") + pw.println(s"$word $strRep") + } } - pw.close() } /** If the word doesn't exist in the lexicon, try to use UNK */ @@ -102,7 +101,7 @@ class OldWordEmbeddingMap(matrixConstructor: Map[String, Array[Double]]) extends * Finds 
the words most similar to this set of inputs * IMPORTANT: words here must already be normalized using Word2vec.sanitizeWord()! */ - def mostSimilarWords(words:Set[String], howMany:Int):List[(String, Double)] = { + override def mostSimilarWords(words:Set[String], howMany:Int):List[(String, Double)] = { val v = new Array[Double](dimensions) var found = false for(w1 <- words) { @@ -451,24 +450,24 @@ object OldWordEmbeddingMap { wordsToUse: Option[Set[String]], caseInsensitiveWordsToUse:Boolean):(Map[String, Array[Double]], Int) = { logger.debug("Started to load embedding matrix from file " + mf + "...") - val src: Source = Source.fromFile(mf, "iso-8859-1") - val lines: Iterator[String] = src.getLines() - val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) - src.close() - logger.debug("Completed matrix loading.") - matrix + Using.resource(Source.fromFile(mf, "iso-8859-1")) { src => + val lines: Iterator[String] = src.getLines() + val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) + logger.debug("Completed matrix loading.") + matrix + } } private def loadMatrixFromStream(is: InputStream, wordsToUse: Option[Set[String]], caseInsensitiveWordsToUse:Boolean):(Map[String, Array[Double]], Int) = { logger.debug("Started to load embedding matrix from stream ...") - val src: Source = Source.fromInputStream(is, "iso-8859-1") - val lines: Iterator[String] = src.getLines() - val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) - src.close() - logger.debug("Completed matrix loading.") - matrix + Using.resource(Source.fromInputStream(is, "iso-8859-1")) { src => + val lines: Iterator[String] = src.getLines() + val matrix = buildMatrix(lines, wordsToUse, caseInsensitiveWordsToUse) + logger.debug("Completed matrix loading.") + matrix + } } private def loadMatrixFromSource(src: Source, wordsToUse: Option[Set[String]], diff --git a/main/src/test/scala/org/clulab/embeddings/TestWordEmbeddingMap.scala 
b/main/src/test/scala/org/clulab/embeddings/TestWordEmbeddingMap.scala index 0e8c232c2..026a86ed5 100644 --- a/main/src/test/scala/org/clulab/embeddings/TestWordEmbeddingMap.scala +++ b/main/src/test/scala/org/clulab/embeddings/TestWordEmbeddingMap.scala @@ -1,10 +1,10 @@ package org.clulab.embeddings -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.InputStreamer import org.clulab.utils.Test import java.io.File +import scala.util.Using class TestWordEmbeddingMap extends Test { val name = "/test_vectors" @@ -150,7 +150,7 @@ class TestWordEmbeddingMap extends Test { val start = System.currentTimeMillis() val inputStreamer = new InputStreamer() val inputStream = inputStreamer.getFileAsStream(fileName + InputStreamer.txtExtension) - val glove = inputStream.autoClose { inputStream => + val glove = Using.resource(inputStream) { inputStream => ExplicitWordEmbeddingMap(inputStream, false) } val stop = System.currentTimeMillis() @@ -162,7 +162,7 @@ class TestWordEmbeddingMap extends Test { val start = System.currentTimeMillis() val inputStreamer = new InputStreamer() val inputStream = inputStreamer.getFileAsStream(fileName + InputStreamer.binExtension) - val glove = inputStream.autoClose { inputStream => + val glove = Using.resource(inputStream) { inputStream => CompactWordEmbeddingMap(inputStream, true) } val stop = System.currentTimeMillis() @@ -176,7 +176,7 @@ class TestWordEmbeddingMap extends Test { val start = System.currentTimeMillis() val inputStreamer = new InputStreamer() val inputStream = inputStreamer.getResourceAsStream(resourceName + InputStreamer.txtExtension) - val glove = inputStream.autoClose { inputStream => + val glove = Using.resource(inputStream) { inputStream => ExplicitWordEmbeddingMap(inputStream, false) } val stop = System.currentTimeMillis() @@ -188,7 +188,7 @@ class TestWordEmbeddingMap extends Test { val start = System.currentTimeMillis() val inputStreamer = new InputStreamer() val inputStream = 
inputStreamer.getResourceAsStream(resourceName + InputStreamer.binExtension) - val glove = inputStream.autoClose { inputStream => + val glove = Using.resource(inputStream) { inputStream => CompactWordEmbeddingMap(inputStream, true) } val stop = System.currentTimeMillis() diff --git a/main/src/test/scala/org/clulab/learning/TestSVMRankingClassifier.scala b/main/src/test/scala/org/clulab/learning/TestSVMRankingClassifier.scala index 44f81b9d8..0c5c861e2 100644 --- a/main/src/test/scala/org/clulab/learning/TestSVMRankingClassifier.scala +++ b/main/src/test/scala/org/clulab/learning/TestSVMRankingClassifier.scala @@ -4,11 +4,11 @@ import org.clulab.utils.Test import java.io.{File, PrintWriter} import org.scalatest._ - import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.io.Source import scala.sys.process._ import scala.util.Try +import scala.util.Using object NeedsExternalBinary extends Tag("NeedsExternalBinary") @@ -55,9 +55,9 @@ class TestSVMRankingClassifier extends Test { // // let's make sure we get the same values as svm_rank_classify - val pw = new PrintWriter("./test.dat") - classifier.mkTestFile(pw, qid3, 1) - pw.close() + Using.resource(new PrintWriter("./test.dat")) { pw => + classifier.mkTestFile(pw, qid3, 1) + } val exitCode = "svm_rank_classify ./test.dat ./model.dat ./predictions".! 
exitCode should be (0) diff --git a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala index e886592bc..bde4bb817 100644 --- a/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala +++ b/main/src/test/scala/org/clulab/numeric/TestNumericEntityRecognition.scala @@ -4,7 +4,7 @@ import org.clulab.processors.Sentence import org.clulab.processors.clu.BalaurProcessor import org.clulab.processors.clu.tokenizer.Tokenizer import org.clulab.struct.Interval -import org.clulab.utils.Test +import org.clulab.utils.{Test, Timer} import org.scalatest.concurrent.TimeLimits import org.scalatest.time.{Seconds, Span} @@ -256,7 +256,12 @@ class TestNumericEntityRecognition extends Test { ensure("autumn in 2017", Interval(0, 3), "DATE-RANGE", "2017-09-22 -- 2017-12-21") ensure("2017 autumn", Interval(0, 2), "DATE-RANGE", "2017-09-22 -- 2017-12-21") ensure("winter", Interval(0, 1), "DATE-RANGE", "XXXX-12-21 -- XXXX-03-20") - ensure("spring", Interval(0, 1), "DATE-RANGE", "XXXX-03-20 -- XXXX-06-21") + ensure("autumn", Interval(0, 1), "DATE-RANGE", "XXXX-09-22 -- XXXX-12-21") +// ensure("spring", Interval(0, 1), "DATE-RANGE", "XXXX-03-20 -- XXXX-06-21") // alice: failing this test is an expected behavior as raw spring/fall is now filtered out by postprocessNumericEntities (filtering out homonyms of spring/falls) + ensure("fall 2021", Interval(0, 2), "DATE-RANGE", "2021-09-22 -- 2021-12-21") + ensure("in the fall", Interval(2, 3), "DATE-RANGE", "XXXX-09-22 -- XXXX-12-21") + ensure("fall", Interval(0, 1), "", "") + ensure("spring", Interval(0, 1), "", "") } it should "recognize date ranges with seasons" in { @@ -324,9 +329,11 @@ class TestNumericEntityRecognition extends Test { // ensure("drier season between November and March", Interval(2, 8), "DATE-RANGE", "XXXX-11-XX -- XXXX-03-XX") // ensure("flooding are expected to occur in July to August 2021", Interval(5, 10), 
"DATE-RANGE", "2021-07-XX -- 2021-08-XX") ensure("farmers sowed Jaya between 20 June and 1 July", Interval(3, 8), "DATE-RANGE", "XXXX-06-20 -- XXXX-07-01") - - // TODO: It would be interesting to handle such dates ranges 1st week of July: "XXXX-07-01 -- XXXX-07-07 - // ensure(sentence= "transplanted during the 1st week of July", Interval(3, 7), goldEntity= "DATE", goldNorm= "XXXX-07-01") + ensure(sentence= "transplanted during the 1st week of July", Interval(3, 7), goldEntity= "DATE-RANGE", goldNorm= "XXXX-07-01 -- XXXX-07-07") + ensure(sentence= "We planted corn the first two weeks of April.", Interval(4, 9), goldEntity= "DATE-RANGE", goldNorm= "XXXX-04-01 -- XXXX-04-14") + ensure(sentence= "We planted beans the second week of May.", Interval(4, 8), goldEntity= "DATE-RANGE", goldNorm= "XXXX-05-08 -- XXXX-05-14") + ensure(sentence= "We planted beans in the last week of June.", Interval(5, 9), goldEntity= "DATE-RANGE", goldNorm= "XXXX-06-24 -- XXXX-06-30") + ensure(sentence= "We planted beans in the last two weeks of February.", Interval(5, 10), goldEntity= "DATE-RANGE", goldNorm= "XXXX-02-15 -- XXXX-02-28") } it should "recognize weird date ranges" in { @@ -408,6 +415,12 @@ class TestNumericEntityRecognition extends Test { // ensure(sentence= "on 18th of Oct 2019", Interval(1, 5), goldEntity= "DATE", goldNorm= "2019-10-18") // ensure(sentence= "old seedlings transplanted on 14 July in 1999/00", Interval(4, 8), goldEntity= "DATE", goldNorm= "2000-07-14") } + + it should "recognize season in year" in { + ensure(sentence = "We applied it in summer in 21", Interval(4, 7), goldEntity= "DATE-RANGE", goldNorm = "XX21-06-21 -- XX21-09-22") + ensure(sentence = "We applied it in Fall in 21", Interval(4, 7), goldEntity= "DATE-RANGE", goldNorm = "XX21-09-22 -- XX21-12-21") + ensure(sentence = "We applied it in fall of 2021", Interval(4, 7), goldEntity= "DATE-RANGE", goldNorm = "2021-09-22 -- 2021-12-21") + } it should "recognize measurement units" in { ensure("It was 12 ha", 
Interval(2, 4), "MEASUREMENT-AREA", "12.0 ha") @@ -586,10 +599,14 @@ class TestNumericEntityRecognition extends Test { it should "not hang" in { val text = "others 1,016 960 250 80 150 1,300 50 1,200 50 700 2,300 3,800 225 800 2 150 200 3,691 7,160 3 130 1,480 1,136 2,515 300 130 875 1,050 30 365400 3,775 Total 2487 3,450 8,575 825 19 112 Source : LM 12 / Saed The SSF 2020/2021 campaign is timidly being set up on the entire left bank of the Senegal River with the establishment of nurseries ." + val timer = new Timer("Keith") - TimeLimits.failAfter(Span(20, Seconds)) { - numericParse(text) + timer.time { +// TimeLimits.failAfter(Span(25, Seconds)) { + numericParse(text) +// } } + println(s"Keith says: ${timer.elapsedToString()}") } // diff --git a/main/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala b/main/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala index 6ea3c2c4e..2bad85f6e 100644 --- a/main/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala +++ b/main/src/test/scala/org/clulab/numeric/TestSeasonNormalizer.scala @@ -37,6 +37,7 @@ class TestSeasonNormalizer extends Test { seasonEntities shouldNot contain (iDateRange) seasonNorms shouldNot contain (fallDateRange) seasonNorms shouldNot contain (seasonDateRange) + } behavior of "Custom seasonal BalaurProcessor" @@ -56,5 +57,7 @@ class TestSeasonNormalizer extends Test { seasonEntities should contain (iDateRange) seasonNorms shouldNot contain (fallDateRange) seasonNorms should contain (seasonDateRange) + } -} + +} \ No newline at end of file diff --git a/main/src/test/scala/org/clulab/odin/TestMention.scala b/main/src/test/scala/org/clulab/odin/TestMention.scala index 263de4ee9..1a87d925b 100644 --- a/main/src/test/scala/org/clulab/odin/TestMention.scala +++ b/main/src/test/scala/org/clulab/odin/TestMention.scala @@ -1,30 +1,69 @@ package org.clulab.odin import org.clulab.TestUtils.jsonStringToDocument +import org.clulab.struct.Interval import org.clulab.utils.Test class 
TestMention extends Test { + val rule = + """ + |rules: + | - name: test + | type: token + | label: TestMention + | pattern: | + | [lemma=I] []* [lemma=dance] + |""".stripMargin + + val ee = ExtractorEngine(rule) + + behavior of "mention.text" // motivated by changes to the words field that replaced `'m` with `am` - "mention.text" should "properly reconstruct the original span" in { + it should "properly reconstruct the original span" in { // I'm going to dance val json = """{"sentences":[{"words":["I","am","going","to","dance","."],"startOffsets":[0,1,4,10,13,18],"endOffsets":[1,3,9,12,18,19],"raw":["I","'m","going","to","dance","."],"tags":["PRP","VBP","VBG","TO","VB","."],"lemmas":["I","be","go","to","dance","."],"entities":["O","O","O","O","O","O"],"norms":["O","O","O","O","O","O"],"chunks":["B-NP","B-VP","I-VP","I-VP","I-VP","O"],"graphs":{"universal-enhanced":{"edges":[{"source":2,"destination":0,"relation":"nsubj"},{"source":2,"destination":1,"relation":"aux"},{"source":2,"destination":4,"relation":"xcomp"},{"source":2,"destination":5,"relation":"punct"},{"source":4,"destination":0,"relation":"nsubj:xsubj"},{"source":4,"destination":3,"relation":"mark"}],"roots":[2]},"universal-basic":{"edges":[{"source":2,"destination":0,"relation":"nsubj"},{"source":2,"destination":1,"relation":"aux"},{"source":2,"destination":4,"relation":"xcomp"},{"source":2,"destination":5,"relation":"punct"},{"source":4,"destination":3,"relation":"mark"}],"roots":[2]}}}]}""" val doc = jsonStringToDocument(json) - val rule = - """ - |rules: - | - name: test - | type: token - | label: TestMention - | pattern: | - | [lemma=I] []* [lemma=dance] - |""".stripMargin - - val ee = ExtractorEngine(rule) val mentions = ee.extractFrom(doc) - mentions should have length(1) + mentions should have length (1) mentions.head.text shouldBe "I'm going to dance" + val head = mentions.head.synHead + mentions.head.distToRootOpt shouldBe (Some(0)) } + behavior of "Mention.getRootDistOpt" + + it should "get 
None when there are no roots" in { + // 2 is wrapped to 2 once here so that it isn't a root. + val json = """{ + |"sentences":[{ + | "words":["I","am","going","to","dance","."], + | "startOffsets":[0,1,4,10,13,18], + | "endOffsets":[1,3,9,12,18,19], + | "raw":["I","'m","going","to","dance","."], + | "tags":["PRP","VBP","VBG","TO","VB","."], + | "lemmas":["I","be","go","to","dance","."], + | "entities":["O","O","O","O","O","O"], + | "norms":["O","O","O","O","O","O"], + | "chunks":["B-NP","B-VP","I-VP","I-VP","I-VP","O"], + | "graphs":{ + | "universal-enhanced":{"edges":[],"roots":[]}, + | "universal-basic":{"edges":[],"roots":[]}} + | }] + |}""" + .stripMargin + val doc = jsonStringToDocument(json) + val mention = ee.extractFrom(doc).head + + mention.distToRootOpt shouldBe (Some(0)) + } + + it should "get None when the Interval is empty" in { + val json = """{"sentences":[{"words":["I","am","going","to","dance","."],"startOffsets":[0,1,4,10,13,18],"endOffsets":[1,3,9,12,18,19],"raw":["I","'m","going","to","dance","."],"tags":["PRP","VBP","VBG","TO","VB","."],"lemmas":["I","be","go","to","dance","."],"entities":["O","O","O","O","O","O"],"norms":["O","O","O","O","O","O"],"chunks":["B-NP","B-VP","I-VP","I-VP","I-VP","O"],"graphs":{"universal-enhanced":{"edges":[{"source":2,"destination":0,"relation":"nsubj"},{"source":2,"destination":1,"relation":"aux"},{"source":2,"destination":4,"relation":"xcomp"},{"source":2,"destination":5,"relation":"punct"},{"source":4,"destination":0,"relation":"nsubj:xsubj"},{"source":4,"destination":3,"relation":"mark"}],"roots":[2]},"universal-basic":{"edges":[{"source":2,"destination":0,"relation":"nsubj"},{"source":2,"destination":1,"relation":"aux"},{"source":2,"destination":4,"relation":"xcomp"},{"source":2,"destination":5,"relation":"punct"},{"source":4,"destination":3,"relation":"mark"}],"roots":[2]}}}]}""" + val doc = jsonStringToDocument(json) + val mention = ee.extractFrom(doc).head.asInstanceOf[TextBoundMention].copy(tokenInterval 
= Interval(0, 0)) + + mention.distToRootOpt shouldBe (None) + } } diff --git a/main/src/test/scala/org/clulab/odin/TestVariables.scala b/main/src/test/scala/org/clulab/odin/TestVariables.scala index d02cf3e27..9d934b843 100644 --- a/main/src/test/scala/org/clulab/odin/TestVariables.scala +++ b/main/src/test/scala/org/clulab/odin/TestVariables.scala @@ -1,18 +1,18 @@ package org.clulab.odin -import scala.io.Source - import org.clulab.TestUtils._ import org.clulab.utils.Test +import scala.io.Source +import scala.util.Using class TestVariables extends Test { def readResource(filename: String): String = { - val source = Source.fromURL(getClass.getResource(filename)) - val data = source.mkString - source.close() - data + Using.resource(Source.fromURL(getClass.getResource(filename))) { source => + val data = source.mkString + data + } } "variables" should "allow for whitespace" in { diff --git a/main/src/test/scala/org/clulab/odin/serialization/TestSerializer.scala b/main/src/test/scala/org/clulab/odin/serialization/TestSerializer.scala index 5e7cb6bbb..4a2a8767c 100644 --- a/main/src/test/scala/org/clulab/odin/serialization/TestSerializer.scala +++ b/main/src/test/scala/org/clulab/odin/serialization/TestSerializer.scala @@ -4,6 +4,8 @@ import org.clulab.TestUtils.jsonStringToDocument import org.clulab.odin.ExtractorEngine import org.clulab.utils.Test +import scala.util.Using + // See TestJSONSerializer for the test upon which this is based. 
class TestSerializer extends Test { @@ -12,9 +14,10 @@ class TestSerializer extends Test { def serialize(anyOut: Any): Boolean = { val streamOut = new ByteArrayOutputStream() - val encoder = new ObjectOutputStream(streamOut) - encoder.writeObject(anyOut) - + Using.resource(new ObjectOutputStream(streamOut)) { encoder => + encoder.writeObject(anyOut) + } + val bytes = streamOut.toByteArray val streamIn = new ByteArrayInputStream(bytes) val decoder = new ObjectInputStream(streamIn) { @@ -28,8 +31,10 @@ class TestSerializer extends Test { } } } - val anyIn = decoder.readObject() - decoder.close() + val anyIn = Using.resource(decoder) { decoder => + decoder.readObject() + } + anyIn == anyOut } } diff --git a/main/src/test/scala/org/clulab/processors/TestDepGraphSizes.scala b/main/src/test/scala/org/clulab/processors/TestDepGraphSizes.scala index 40da03473..54ed13cc1 100644 --- a/main/src/test/scala/org/clulab/processors/TestDepGraphSizes.scala +++ b/main/src/test/scala/org/clulab/processors/TestDepGraphSizes.scala @@ -2,10 +2,10 @@ package org.clulab.processors import org.clulab.struct.DirectedGraph -/** Makes sure that CluProcessor produces dependency graphs of correct sizes */ +/** Makes sure that Processor produces dependency graphs of correct sizes */ class TestDepGraphSizes extends CluTest { - "CluProcessor" should "produce dependency graphs that have the same size as the sentence" in { + "Processor" should "produce dependency graphs that have the same size as the sentence" in { // Document 3 // val text = "Raise fertility on \n\n" // Document 11 diff --git a/main/src/test/scala/org/clulab/processors/TestDueTo.scala b/main/src/test/scala/org/clulab/processors/TestDueTo.scala index 98c272eb0..41f4433b2 100644 --- a/main/src/test/scala/org/clulab/processors/TestDueTo.scala +++ b/main/src/test/scala/org/clulab/processors/TestDueTo.scala @@ -28,7 +28,7 @@ class TestDueTo extends CluTest { "Rorer Group Inc. 
will report that third-quarter profit rose more than 15% from a year earlier, though the gain is wholly due to asset sales, Robert Cawthorn, chairman, president and chief executive officer, said.", "Mr. Cawthorn said the profit growth in the latest quarter was due to the sale of two Rorer drugs.", "Although this widow earns only twice the minimum wage, largely due to the earnings limit, she would have to earn an additional $4,930 to offset her catastrophic surtax of $496.", - "Past Colombian government tolerance of the \"narcotraficantes\" was due to the drug lords' history of wiping out leftists in the hinterlands.", + "Past Colombian government tolerance of the \"narcotraficantes\" was due to the drug lords' history of wiping out leftists in the hinterlands." // , // due X to // "As for joint ventures, Mr. Houghton said profit was \"essentially flat\" due primarily to a slow recovery at Samsung-Corning Co. in Korea following a strike at a major customer and the disruption of shipments to China.", diff --git a/main/src/test/scala/org/clulab/processors/TestLemmatizer.scala b/main/src/test/scala/org/clulab/processors/TestLemmatizer.scala index 89d0e12be..13f858d4e 100644 --- a/main/src/test/scala/org/clulab/processors/TestLemmatizer.scala +++ b/main/src/test/scala/org/clulab/processors/TestLemmatizer.scala @@ -1,6 +1,6 @@ package org.clulab.processors -import org.clulab.utils.{FileUtils, Sourcer} +import org.clulab.utils.FileUtils class TestLemmatizer extends CluTest { diff --git a/main/src/test/scala/org/clulab/processors/TestLexiconNER.scala b/main/src/test/scala/org/clulab/processors/TestLexiconNER.scala index 6e057b707..60491b049 100644 --- a/main/src/test/scala/org/clulab/processors/TestLexiconNER.scala +++ b/main/src/test/scala/org/clulab/processors/TestLexiconNER.scala @@ -1,14 +1,9 @@ package org.clulab.processors +import org.clulab.sequences.LexiconNER import org.clulab.sequences.FileOverrideKbSource import org.clulab.sequences.FileStandardKbSource import 
org.clulab.sequences.LexicalVariations - -import java.io.ByteArrayInputStream -import java.io.ByteArrayOutputStream -import java.io.ObjectInputStream -import java.io.ObjectOutputStream -import org.clulab.sequences.LexiconNER import org.clulab.sequences.MemoryOverrideKbSource import org.clulab.sequences.MemoryStandardKbSource import org.clulab.sequences.NoLexicalVariations @@ -16,11 +11,15 @@ import org.clulab.sequences.ResourceOverrideKbSource import org.clulab.sequences.ResourceStandardKbSource import org.clulab.struct.EntityValidator import org.clulab.struct.TrueEntityValidator -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.SeqOdometer +import java.io.ByteArrayInputStream +import java.io.ByteArrayOutputStream import java.io.File +import java.io.ObjectInputStream +import java.io.ObjectOutputStream import scala.collection.mutable +import scala.util.Using class TestLexiconNER extends CluTest { @@ -36,14 +35,14 @@ class TestLexiconNER extends CluTest { def serialize(value: Any): Array[Byte] = { val byteArrayOutputStream = new ByteArrayOutputStream() - new ObjectOutputStream(byteArrayOutputStream).autoClose { objectOutputStream => + Using.resource(new ObjectOutputStream(byteArrayOutputStream)) { objectOutputStream => objectOutputStream.writeObject(value) } byteArrayOutputStream.toByteArray } def deserialize(bytes: Array[Byte]): Any = { - val ner = new ObjectInputStream(new ByteArrayInputStream(bytes)).autoClose { objectInputStream => + val ner = Using.resource(new ObjectInputStream(new ByteArrayInputStream(bytes))) { objectInputStream => objectInputStream.readObject } @@ -341,7 +340,7 @@ class TestLexiconNER extends CluTest { def serialize(entityValidator: EntityValidator): Array[Byte] = { val byteArrayOutputStream = new ByteArrayOutputStream() - new ObjectOutputStream(byteArrayOutputStream).autoClose { objectOutputStream => + Using.resource(new ObjectOutputStream(byteArrayOutputStream)) { objectOutputStream => 
objectOutputStream.writeObject(entityValidator) } byteArrayOutputStream.toByteArray diff --git a/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala index d69741f75..bdb8f7796 100644 --- a/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala +++ b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala @@ -3,13 +3,12 @@ package org.clulab.processors import org.clulab.processors.clu.BalaurProcessor import org.clulab.scala.WrappedArray._ import org.clulab.serialization.DocumentSerializer -import org.clulab.utils.Closer.AutoCloser -import org.clulab.utils.{Sourcer, Test} +import org.clulab.utils.{Sourcer, StringUtils, Test} -import java.io.{PrintWriter, StringWriter} +import scala.util.Using class TestMkCombinedDocument extends Test { - val sentences = Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt").autoClose { source => + val sentences = Using.resource(Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt")) { source => source.getLines().toArray } val manySentenceLengths = Array( @@ -29,12 +28,9 @@ class TestMkCombinedDocument extends Test { val processor = new BalaurProcessor() def toString(document: Document): String = { - val stringWriter = new StringWriter() - - new PrintWriter(stringWriter).autoClose { printWriter => + StringUtils.viaPrintWriter { printWriter => documentSerializer.save(document, printWriter, keepText = true) } - stringWriter.toString } behavior of "mkCombinedDocument" diff --git a/main/src/test/scala/org/clulab/processors/TestParentheses.scala b/main/src/test/scala/org/clulab/processors/TestParentheses.scala index c6d0c0f6d..b74ae96d4 100644 --- a/main/src/test/scala/org/clulab/processors/TestParentheses.scala +++ b/main/src/test/scala/org/clulab/processors/TestParentheses.scala @@ -9,7 +9,7 @@ package org.clulab.processors */ class 
TestParentheses extends CluTest { - "CluProcessor" should "tokenize, lemmatize, and POS tag parentheses correctly" in { + "Processor" should "tokenize, lemmatize, and POS tag parentheses correctly" in { // TODO: add back when we have a POS tagger /* val doc = proc.mkDocument("Moreover, in von Willebrand factor-stimulated platelets, the tyrosine phosphorylation of pp60(c-src) is closely associated with the activation of phosphatidylinositol 3-kinase (PIK), and two adhesion receptors, glycoprotein (Gp)Ib and GpIIb/IIIa(alpha-IIb-beta(3)), are involved. ") diff --git a/main/src/test/scala/org/clulab/processors/TestCluProcessor.scala b/main/src/test/scala/org/clulab/processors/TestProcessor.scala similarity index 98% rename from main/src/test/scala/org/clulab/processors/TestCluProcessor.scala rename to main/src/test/scala/org/clulab/processors/TestProcessor.scala index 616ba56b5..9cf3652d4 100644 --- a/main/src/test/scala/org/clulab/processors/TestCluProcessor.scala +++ b/main/src/test/scala/org/clulab/processors/TestProcessor.scala @@ -1,13 +1,13 @@ package org.clulab.processors /** - * Unit tests for CluProcessor + * Unit tests for BalaurProcessor * User: mihais * Date: 6/17/17 */ -class TestCluProcessor extends CluTest { +class TestProcessor extends CluTest { - "CluProcessor" should "tokenize raw text correctly" in { + "Processor" should "tokenize raw text correctly" in { val doc = proc.mkDocument("John Doe went to China. 
There, he visited Beijing.") doc.clear() diff --git a/main/src/test/scala/org/clulab/processors/TestUniversalEnhancedDependencies.scala b/main/src/test/scala/org/clulab/processors/TestUniversalEnhancedDependencies.scala index 59f278ccb..c9d6fbbba 100644 --- a/main/src/test/scala/org/clulab/processors/TestUniversalEnhancedDependencies.scala +++ b/main/src/test/scala/org/clulab/processors/TestUniversalEnhancedDependencies.scala @@ -2,7 +2,7 @@ package org.clulab.processors class TestUniversalEnhancedDependencies extends CluTest { - "CluProcessor" should "parse some basic sentences correctly" in { + "Processor" should "parse some basic sentences correctly" in { var doc = proc.annotate("Ras1 is associated with cancer.") // TODO: this should be nsubjpass (once we have a model trained on Genia) doc.sentences.head.universalBasicDependencies.get.hasEdge(2, 0, "nsubj") should be(true) diff --git a/main/src/test/scala/org/clulab/processors/apps/ExtractSentencesApp.scala b/main/src/test/scala/org/clulab/processors/apps/ExtractSentencesApp.scala index 5361a342a..782abfd0b 100644 --- a/main/src/test/scala/org/clulab/processors/apps/ExtractSentencesApp.scala +++ b/main/src/test/scala/org/clulab/processors/apps/ExtractSentencesApp.scala @@ -1,9 +1,10 @@ package org.clulab.processors.apps import org.clulab.processors.clu.BalaurProcessor -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.FileUtils +import scala.util.Using + object ExtractSentencesApp extends App { val directoryName = args.lift(0).getOrElse("../corpora/Doc16k/txt") val fileName = args.lift(1).getOrElse("sentences.txt") @@ -12,7 +13,7 @@ object ExtractSentencesApp extends App { val processor = new BalaurProcessor() var count = 0 - FileUtils.printWriterFromFile(fileName).autoClose { printWriter => + Using.resource(FileUtils.printWriterFromFile(fileName)) { printWriter => files.foreach { file => val text = FileUtils.getTextFromFile(file) val document = processor.mkDocument(text, keepText = true) diff 
--git a/main/src/test/scala/org/clulab/processors/apps/TokenClassifierTimerApp.scala b/main/src/test/scala/org/clulab/processors/apps/TokenClassifierTimerApp.scala index 67cfa8f6e..56b40fdb4 100644 --- a/main/src/test/scala/org/clulab/processors/apps/TokenClassifierTimerApp.scala +++ b/main/src/test/scala/org/clulab/processors/apps/TokenClassifierTimerApp.scala @@ -3,6 +3,7 @@ package org.clulab.processors.apps import org.clulab.processors.clu.BalaurProcessor import org.clulab.utils.{Sourcer, Timers} +import scala.util.Using object TokenClassifierTimerApp extends App { val fileName = args.lift(0).getOrElse("../sentences.txt") @@ -13,11 +14,10 @@ object TokenClassifierTimerApp extends App { processor } val lines = { - val source = Sourcer.sourceFromFilename(fileName) - val lines = source.getLines().take(100).toArray - - source.close - lines + Using.resource(Sourcer.sourceFromFilename(fileName)) { source => + val lines = source.getLines().take(100).toArray + lines + } } val elapsedTimer = Timers.getOrNew("Elapsed") diff --git a/main/src/test/scala/org/clulab/struct/TestCounter.scala b/main/src/test/scala/org/clulab/struct/TestCounter.scala index 8d3cfe54e..140d6de46 100644 --- a/main/src/test/scala/org/clulab/struct/TestCounter.scala +++ b/main/src/test/scala/org/clulab/struct/TestCounter.scala @@ -1,9 +1,6 @@ package org.clulab.struct -import java.io.{BufferedWriter, PrintWriter, StringWriter} - -import org.clulab.utils.Files -import org.clulab.utils.Test +import org.clulab.utils.{StringUtils, Test} /** * Tests Counter methods @@ -12,16 +9,15 @@ import org.clulab.utils.Test */ class TestCounter extends Test { "TestCounter" should "serialize content correctly in saveTo " in { - val sw = new StringWriter() - val w = Files.toPrintWriter(sw) - val c = new Counter[String]() - c += "uno" - c += "dos" - c.saveTo(w) - w.close() + val string = StringUtils.viaPrintWriter { printWriter => + val c = new Counter[String]() + c += "uno" + c += "dos" + c.saveTo(printWriter) + } val 
eol = System.getProperty("line.separator") - val content = sw.toString.replace(eol, " ") + val content = string.replace(eol, " ") val values = content.split(' ') val Array(defaultReturnValue, size, kind) = values.take(3) diff --git a/main/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala b/main/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala index 91a6c8f3a..b84e337a3 100644 --- a/main/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala +++ b/main/src/test/scala/org/clulab/struct/TestDocumentAttachment.scala @@ -1,10 +1,5 @@ package org.clulab.struct -import java.io.ByteArrayInputStream -import java.io.ByteArrayOutputStream -import java.io.ObjectInputStream -import java.io.ObjectOutputStream - import org.clulab.processors.Document import org.clulab.processors.Sentence import org.clulab.serialization.DocumentSerializer @@ -13,12 +8,17 @@ import org.clulab.struct.test.CaseClass import org.clulab.struct.test.ObjectNameDocumentAttachment import org.clulab.struct.test.NameDocumentAttachment import org.clulab.struct.test.TextNameDocumentAttachment -import org.clulab.utils.Closer.AutoCloser import org.clulab.utils.Test import org.json4s.jackson.parseJson import org.json4s.jackson.prettyJson import org.json4s.jackson.renderJValue +import java.io.ByteArrayInputStream +import java.io.ByteArrayOutputStream +import java.io.ObjectInputStream +import java.io.ObjectOutputStream +import scala.util.Using + class TestDocumentAttachment extends Test { protected val FIRST_KEY = "first" protected val MIDDLE_KEY = "middle" @@ -31,8 +31,8 @@ class TestDocumentAttachment extends Test { protected val ALIAS_NAME = "Alias" def serialize(any: Any): Array[Byte] = { - new ByteArrayOutputStream().autoClose { byteArrayOutputStream => - new ObjectOutputStream(byteArrayOutputStream).autoClose { objectOutputStream => + Using.resource(new ByteArrayOutputStream()) { byteArrayOutputStream => + Using.resource(new ObjectOutputStream(byteArrayOutputStream)) { 
objectOutputStream => try { objectOutputStream.writeObject(any) } @@ -47,8 +47,8 @@ class TestDocumentAttachment extends Test { } def deserialize[T](byteArray: Array[Byte]): T = { - new ByteArrayInputStream(byteArray).autoClose { byteArrayInputStream => - new ObjectInputStream(byteArrayInputStream).autoClose { objectInputStream => + Using.resource(new ByteArrayInputStream(byteArray)) { byteArrayInputStream => + Using.resource(new ObjectInputStream(byteArrayInputStream)) { objectInputStream => try { val res1 = objectInputStream.readObject() val res2 = res1.asInstanceOf[T] diff --git a/main/src/test/scala/org/clulab/utils/TestAutoClosing.scala b/main/src/test/scala/org/clulab/utils/TestAutoClosing.scala index 6e5ed9564..c1bba4e96 100644 --- a/main/src/test/scala/org/clulab/utils/TestAutoClosing.scala +++ b/main/src/test/scala/org/clulab/utils/TestAutoClosing.scala @@ -1,10 +1,10 @@ package org.clulab.utils -import org.clulab.utils.Closer.AutoCloser import org.scalatest._ import java.io.Closeable import scala.io.Source +import scala.util.Using class TestAutoClosing extends Test { @@ -22,7 +22,7 @@ class TestAutoClosing extends Test { it should "be able to produce a simple result" in { val closing = new Closing() - val result = closing.autoClose { _ => + val result = Using.resource(closing) { _ => 5 } result should be (5) @@ -31,7 +31,7 @@ class TestAutoClosing extends Test { it should "be able to produce a null result" in { val closing = new Closing() - val result: AnyRef = closing.autoClose { _ => + val result: AnyRef = Using.resource(closing) { _ => null } @@ -41,7 +41,7 @@ class TestAutoClosing extends Test { it should "be able to produce a None result" in { val closing = new Closing() - val result = closing.autoClose { _ => + val result = Using.resource(closing) { _ => None } result should be (None) @@ -50,7 +50,7 @@ class TestAutoClosing extends Test { it should "be able to produce a Some result" in { val closing = new Closing() - val result = closing.autoClose { 
_ => + val result = Using.resource(closing) { _ => Some(5) } result should be (Some(5)) @@ -61,7 +61,7 @@ class TestAutoClosing extends Test { val closing = new Closing() an [IllegalStateException] should be thrownBy { - closing.autoClose(_ => throw new IllegalStateException("Boom!")) + Using.resource(closing)(_ => throw new IllegalStateException("Boom!")) } closing.closed should be (true) } @@ -70,7 +70,7 @@ class TestAutoClosing extends Test { val closing = new Closing() an [StackOverflowError] should be thrownBy { - closing.autoClose(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -79,7 +79,7 @@ class TestAutoClosing extends Test { val closing = new Closing(Some(new IllegalStateException("Boom!"))) an [IllegalStateException] should be thrownBy { - closing.autoClose(_ => "Hello") + Using.resource(closing)(_ => "Hello") } closing.closed should be (true) } @@ -88,7 +88,7 @@ class TestAutoClosing extends Test { val closing = new Closing(Some(new StackOverflowError("Boom!"))) an [StackOverflowError] should be thrownBy { - closing.autoClose(_ => "Hello") + Using.resource(closing)(_ => "Hello") } closing.closed should be (true) } @@ -97,7 +97,7 @@ class TestAutoClosing extends Test { val closing = new Closing(Some(new IllegalStateException("Boom!"))) an [RuntimeException] should be thrownBy { - closing.autoClose(_ => throw new RuntimeException("Boom!")) + Using.resource(closing)(_ => throw new RuntimeException("Boom!")) } closing.closed should be (true) } @@ -106,7 +106,7 @@ class TestAutoClosing extends Test { val closing = new Closing(Some(new OutOfMemoryError("Boom!"))) an [StackOverflowError] should be thrownBy { - closing.autoClose(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -115,7 +115,7 @@ class TestAutoClosing extends Test { val closing = new 
Closing(Some(new IllegalStateException("Boom!"))) an [StackOverflowError] should be thrownBy { - closing.autoClose(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -124,7 +124,7 @@ class TestAutoClosing extends Test { val closing = new Closing(Some(new OutOfMemoryError("Boom!"))) an [OutOfMemoryError] should be thrownBy { - closing.autoClose(_ => throw new IllegalStateException("Boom!")) + Using.resource(closing)(_ => throw new IllegalStateException("Boom!")) } closing.closed should be (true) } @@ -137,13 +137,13 @@ class TestAutoClosing extends Test { } an [RuntimeException] should be thrownBy { - getClosing.autoClose( _ => 5) + Using.resource(getClosing)( _ => 5) } closing.closed should be (false) } it should "work with a plain Source, even in Scala 2.11" in { - Source.fromString("foo\nbar\n").autoClose { source => + Using.resource(Source.fromString("foo\nbar\n")) { source => source.getLines().toList } } diff --git a/main/src/test/scala/org/clulab/utils/TestClosing.scala b/main/src/test/scala/org/clulab/utils/TestClosing.scala index e9903692e..4ac7ff1b3 100644 --- a/main/src/test/scala/org/clulab/utils/TestClosing.scala +++ b/main/src/test/scala/org/clulab/utils/TestClosing.scala @@ -4,6 +4,7 @@ import org.scalatest._ import java.io.Closeable import scala.io.Source +import scala.util.Using class TestClosing extends Test { @@ -21,7 +22,7 @@ class TestClosing extends Test { it should "be able to produce a simple result" in { val closing = new Closing() - val result = Closer.autoClose(closing) { _ => + val result = Using.resource(closing) { _ => 5 } result should be (5) @@ -30,7 +31,7 @@ class TestClosing extends Test { it should "be able to produce a null result" in { val closing = new Closing() - val result: AnyRef = Closer.autoClose(closing) { _ => + val result: AnyRef = Using.resource(closing) { _ => null } @@ -40,7 +41,7 @@ class TestClosing extends Test { it 
should "be able to produce a None result" in { val closing = new Closing() - val result = Closer.autoClose(closing) { _ => + val result = Using.resource(closing) { _ => None } result should be (None) @@ -49,7 +50,7 @@ class TestClosing extends Test { it should "be able to produce a Some result" in { val closing = new Closing() - val result = Closer.autoClose(closing) { _ => + val result = Using.resource(closing) { _ => Some(5) } result should be (Some(5)) @@ -60,7 +61,7 @@ class TestClosing extends Test { val closing = new Closing() an [IllegalStateException] should be thrownBy { - Closer.autoClose(closing)(_ => throw new IllegalStateException("Boom!")) + Using.resource(closing)(_ => throw new IllegalStateException("Boom!")) } closing.closed should be (true) } @@ -69,7 +70,7 @@ class TestClosing extends Test { val closing = new Closing() an [StackOverflowError] should be thrownBy { - Closer.autoClose(closing)(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -78,7 +79,7 @@ class TestClosing extends Test { val closing = new Closing(Some(new IllegalStateException("Boom!"))) an [IllegalStateException] should be thrownBy { - Closer.autoClose(closing)(_ => "Hello") + Using.resource(closing)(_ => "Hello") } closing.closed should be (true) } @@ -87,7 +88,7 @@ class TestClosing extends Test { val closing = new Closing(Some(new StackOverflowError("Boom!"))) an [StackOverflowError] should be thrownBy { - Closer.autoClose(closing)(_ => "Hello") + Using.resource(closing)(_ => "Hello") } closing.closed should be (true) } @@ -96,7 +97,7 @@ class TestClosing extends Test { val closing = new Closing(Some(new IllegalStateException("Boom!"))) an [RuntimeException] should be thrownBy { - Closer.autoClose(closing)(_ => throw new RuntimeException("Boom!")) + Using.resource(closing)(_ => throw new RuntimeException("Boom!")) } closing.closed should be (true) } @@ -105,7 +106,7 @@ class 
TestClosing extends Test { val closing = new Closing(Some(new OutOfMemoryError("Boom!"))) an [StackOverflowError] should be thrownBy { - Closer.autoClose(closing)(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -114,7 +115,7 @@ class TestClosing extends Test { val closing = new Closing(Some(new IllegalStateException("Boom!"))) an [StackOverflowError] should be thrownBy { - Closer.autoClose(closing)(_ => throw new StackOverflowError("Boom!")) + Using.resource(closing)(_ => throw new StackOverflowError("Boom!")) } closing.closed should be (true) } @@ -123,7 +124,7 @@ class TestClosing extends Test { val closing = new Closing(Some(new OutOfMemoryError("Boom!"))) an [OutOfMemoryError] should be thrownBy { - Closer.autoClose(closing)(_ => throw new IllegalStateException("Boom!")) + Using.resource(closing)(_ => throw new IllegalStateException("Boom!")) } closing.closed should be (true) } @@ -136,13 +137,13 @@ class TestClosing extends Test { } an [RuntimeException] should be thrownBy { - Closer.autoClose(getClosing)( _ => 5) + Using.resource(getClosing)( _ => 5) } closing.closed should be (false) } it should "work with a plain Source, even in Scala 2.11" in { val source = Source.fromString("foo\nbar\n") - Closer.close(source) + source.close() } } diff --git a/main/src/test/scala/org/clulab/utils/TestCrLf.scala b/main/src/test/scala/org/clulab/utils/TestCrLf.scala index 8332c4635..935ebca3b 100644 --- a/main/src/test/scala/org/clulab/utils/TestCrLf.scala +++ b/main/src/test/scala/org/clulab/utils/TestCrLf.scala @@ -4,10 +4,7 @@ import java.io.BufferedInputStream import java.io.File import java.io.FileInputStream import java.io.InputStreamReader - -import org.clulab.utils.Closer.AutoCloser - -import org.scalatest._ +import scala.util.Using class TestCrLf extends Test { @@ -24,7 +21,7 @@ class TestCrLf extends Test { ), Sourcer.utf8 ) - val hasCrLf = 
inputReader.autoClose { inputReader => + val hasCrLf = Using.resource(inputReader) { inputReader => var hasCrLf = false var endedWithCr = false diff --git a/main/src/test/scala/org/clulab/utils/TestDependencyUtils.scala b/main/src/test/scala/org/clulab/utils/TestDependencyUtils.scala index c9ae98525..d7fb2ce6e 100644 --- a/main/src/test/scala/org/clulab/utils/TestDependencyUtils.scala +++ b/main/src/test/scala/org/clulab/utils/TestDependencyUtils.scala @@ -15,7 +15,10 @@ class TestDependencyUtils extends Test { "failure of Gab1 to bind p85, and potentially recruit Shp2, would influence levels of EGFR autophosphorylation." val doc1 = jsonStringToDocument(""" {"sentences":[{"words":["Because","the","substrates","of","Shp2","are","for","the","most","part","unknown",",","we","were","additionally","interested","in","examining","the","state","of","EGFR","tyrosine","phosphorylation","following","treatment","with","EGF","in","order","to","determine","if","the","failure","of","Gab1","to","bind","p85",",","and","potentially","recruit","Shp2",",","would","influence","levels","of","EGFR","autophosphorylation","."],"startOffsets":[0,8,12,23,26,31,35,39,43,48,53,60,62,65,70,83,94,97,107,111,117,120,125,134,150,160,170,175,179,182,188,191,201,204,208,216,219,224,227,232,235,237,241,253,261,265,267,273,283,290,293,298,317],"endOffsets":[7,11,22,25,30,34,38,42,47,52,60,61,64,69,82,93,96,106,110,116,119,124,133,149,159,169,174,178,181,187,190,200,203,207,215,218,223,226,231,235,236,240,252,260,265,266,272,282,289,292,297,317,318],"tags":["IN","DT","NNS","IN","NN","VBP","IN","DT","JJS","NN","JJ",",","PRP","VBD","RB","JJ","IN","VBG","DT","NN","IN","NN","NN","NN","VBG","NN","IN","NN","IN","NN","TO","VB","IN","DT","NN","IN","NN","TO","VB","NN",",","CC","RB","VB","NN",",","MD","VB","NNS","IN","NN","NN","."],"lemmas":["because","the","substrate","of","shp2","be","for","the","most","part","unknown",",","we","be","additionally","interested","in","examine","the","state","of","egfr","tyrosine"
,"phosphorylation","follow","treatment","with","egf","in","order","to","determine","if","the","failure","of","gab1","to","bind","p85",",","and","potentially","recruit","shp2",",","would","influence","level","of","egfr","autophosphorylation","."],"entities":["O","O","O","O","B-Gene_or_gene_product","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-Gene_or_gene_product","B-Site","O","O","O","O","B-Gene_or_gene_product","O","O","O","O","O","O","O","O","B-Gene_or_gene_product","O","O","B-Family","O","O","O","O","B-Gene_or_gene_product","O","O","O","O","O","B-Gene_or_gene_product","O","O"],"chunks":["B-SBAR","B-NP","I-NP","B-PP","B-NP","B-VP","B-PP","B-NP","I-NP","I-NP","B-ADJP","O","B-NP","B-VP","B-ADJP","I-ADJP","B-PP","B-VP","B-NP","I-NP","B-PP","B-NP","I-NP","I-NP","I-NP","I-NP","B-PP","B-NP","B-SBAR","O","B-VP","I-VP","B-SBAR","B-NP","I-NP","B-PP","B-NP","B-VP","I-VP","B-NP","O","O","B-VP","I-VP","B-NP","O","B-VP","I-VP","B-NP","B-PP","B-NP","I-NP","O"],"graphs":{"universal-basic":{"edges":[{"source":2,"destination":1,"relation":"det"},{"source":2,"destination":3,"relation":"prep"},{"source":3,"destination":4,"relation":"pobj"},{"source":5,"destination":0,"relation":"mark"},{"source":5,"destination":2,"relation":"nsubj"},{"source":5,"destination":6,"relation":"prep"},{"source":6,"destination":9,"relation":"pobj"},{"source":9,"destination":7,"relation":"det"},{"source":9,"destination":8,"relation":"amod"},{"source":9,"destination":10,"relation":"amod"},{"source":15,"destination":31,"relation":"advcl"},{"source":15,"destination":16,"relation":"prep"},{"source":15,"destination":5,"relation":"advcl"},{"source":15,"destination":12,"relation":"nsubj"},{"source":15,"destination":13,"relation":"cop"},{"source":15,"destination":14,"relation":"advmod"},{"source":16,"destination":17,"relation":"pcomp"},{"source":17,"destination":19,"relation":"dobj"},{"source":19,"destination":18,"relation":"det"},{"source":19,"destination":20,"relation":"prep"},{"source":20,"
destination":23,"relation":"pobj"},{"source":23,"destination":21,"relation":"nn"},{"source":23,"destination":22,"relation":"nn"},{"source":23,"destination":24,"relation":"prep"},{"source":24,"destination":25,"relation":"pobj"},{"source":25,"destination":26,"relation":"prep"},{"source":26,"destination":27,"relation":"pobj"},{"source":31,"destination":47,"relation":"advcl"},{"source":31,"destination":28,"relation":"mark"},{"source":31,"destination":29,"relation":"dep"},{"source":31,"destination":30,"relation":"aux"},{"source":34,"destination":33,"relation":"det"},{"source":34,"destination":35,"relation":"prep"},{"source":35,"destination":36,"relation":"pobj"},{"source":36,"destination":38,"relation":"vmod"},{"source":38,"destination":37,"relation":"aux"},{"source":38,"destination":39,"relation":"dobj"},{"source":38,"destination":41,"relation":"cc"},{"source":38,"destination":43,"relation":"conj"},{"source":43,"destination":42,"relation":"advmod"},{"source":43,"destination":44,"relation":"dobj"},{"source":47,"destination":32,"relation":"mark"},{"source":47,"destination":48,"relation":"dobj"},{"source":47,"destination":34,"relation":"nsubj"},{"source":47,"destination":46,"relation":"aux"},{"source":48,"destination":49,"relation":"prep"},{"source":49,"destination":51,"relation":"pobj"},{"source":51,"destination":50,"relation":"nn"}],"roots":[15]},"universal-enhanced":{"edges":[{"source":2,"destination":1,"relation":"det"},{"source":2,"destination":4,"relation":"prep_of"},{"source":5,"destination":0,"relation":"mark"},{"source":5,"destination":2,"relation":"nsubj"},{"source":5,"destination":9,"relation":"prep_for"},{"source":9,"destination":7,"relation":"det"},{"source":9,"destination":8,"relation":"amod"},{"source":9,"destination":10,"relation":"amod"},{"source":15,"destination":31,"relation":"advcl"},{"source":15,"destination":17,"relation":"prepc_in"},{"source":15,"destination":5,"relation":"advcl"},{"source":15,"destination":12,"relation":"nsubj"},{"source":15,"destin
ation":13,"relation":"cop"},{"source":15,"destination":14,"relation":"advmod"},{"source":17,"destination":19,"relation":"dobj"},{"source":19,"destination":18,"relation":"det"},{"source":19,"destination":23,"relation":"prep_of"},{"source":23,"destination":21,"relation":"nn"},{"source":23,"destination":22,"relation":"nn"},{"source":23,"destination":25,"relation":"prep_following"},{"source":25,"destination":27,"relation":"prep_with"},{"source":31,"destination":47,"relation":"advcl"},{"source":31,"destination":28,"relation":"mark"},{"source":31,"destination":29,"relation":"dep"},{"source":31,"destination":30,"relation":"aux"},{"source":34,"destination":33,"relation":"det"},{"source":34,"destination":36,"relation":"prep_of"},{"source":36,"destination":38,"relation":"vmod"},{"source":36,"destination":43,"relation":"vmod"},{"source":38,"destination":37,"relation":"aux"},{"source":38,"destination":39,"relation":"dobj"},{"source":38,"destination":43,"relation":"conj_and"},{"source":43,"destination":42,"relation":"advmod"},{"source":43,"destination":44,"relation":"dobj"},{"source":47,"destination":32,"relation":"mark"},{"source":47,"destination":48,"relation":"dobj"},{"source":47,"destination":34,"relation":"nsubj"},{"source":47,"destination":46,"relation":"aux"},{"source":48,"destination":51,"relation":"prep_of"},{"source":51,"destination":50,"relation":"nn"}],"roots":[15]}}}]} """) val sent1 = doc1.sentences.head - text1 should "produce 'substrates' as the head of 'the substrates of Shp2'" in { + + behavior of text1 + + it should "produce 'substrates' as the head of 'the substrates of Shp2'" in { // 3 is a root, so be sure to avoid it in the former interval (1, 5). 
val result = findHeadStrict(Interval(1, 3), sent1) result shouldBe defined @@ -34,7 +37,10 @@ class TestDependencyUtils extends Test { val text2 = "The docking protein Gab1 is the primary mediator of EGF-stimulated activation of the PI-3K/Akt cell survival pathway" val doc2 = jsonStringToDocument(""" {"sentences":[{"words":["The","docking","protein","Gab1","is","the","primary","mediator","of","EGF","stimulated","activation","of","the","PI-3K","and","Akt","cell","survival","pathway"],"startOffsets":[0,4,12,20,25,28,32,40,49,52,56,67,78,81,85,90,91,95,100,109],"endOffsets":[3,11,19,24,27,31,39,48,51,55,66,77,80,84,90,91,94,99,108,116],"tags":["DT","NN","NN","NN","VBZ","DT","JJ","NN","IN","NN","VBD","NN","IN","DT","NN","CC","NN","NN","NN","NN"],"lemmas":["the","docking","protein","gab1","be","the","primary","mediator","of","egf","stimulate","activation","of","the","pi-3k","and","akt","cell","survival","pathway"],"entities":["O","B-Family","O","B-Gene_or_gene_product","O","O","O","O","O","B-Gene_or_gene_product","O","O","O","O","B-Gene_or_gene_product","O","B-Gene_or_gene_product","B-BioProcess","I-BioProcess","O"],"chunks":["B-NP","I-NP","I-NP","I-NP","B-VP","B-NP","I-NP","I-NP","B-PP","B-NP","I-NP","I-NP","B-PP","B-NP","I-NP","I-NP","I-NP","I-NP","I-NP","I-NP"],"graphs":{"universal-basic":{"edges":[{"source":3,"destination":0,"relation":"det"},{"source":3,"destination":1,"relation":"nn"},{"source":3,"destination":2,"relation":"nn"},{"source":7,"destination":3,"relation":"nsubj"},{"source":7,"destination":4,"relation":"cop"},{"source":7,"destination":5,"relation":"det"},{"source":7,"destination":6,"relation":"amod"},{"source":7,"destination":8,"relation":"prep"},{"source":8,"destination":9,"relation":"pobj"},{"source":10,"destination":19,"relation":"nsubj"},{"source":10,"destination":7,"relation":"dep"},{"source":10,"destination":11,"relation":"dobj"},{"source":11,"destination":12,"relation":"prep"},{"source":12,"destination":14,"relation":"pobj"},{"source":14,"destina
tion":15,"relation":"cc"},{"source":14,"destination":18,"relation":"conj"},{"source":14,"destination":13,"relation":"det"},{"source":18,"destination":16,"relation":"nn"},{"source":18,"destination":17,"relation":"nn"}],"roots":[10]},"universal-enhanced":{"edges":[{"source":3,"destination":0,"relation":"det"},{"source":3,"destination":1,"relation":"nn"},{"source":3,"destination":2,"relation":"nn"},{"source":7,"destination":3,"relation":"nsubj"},{"source":7,"destination":4,"relation":"cop"},{"source":7,"destination":5,"relation":"det"},{"source":7,"destination":6,"relation":"amod"},{"source":7,"destination":9,"relation":"prep_of"},{"source":10,"destination":19,"relation":"nsubj"},{"source":10,"destination":7,"relation":"dep"},{"source":10,"destination":11,"relation":"dobj"},{"source":11,"destination":18,"relation":"prep_of"},{"source":11,"destination":14,"relation":"prep_of"},{"source":14,"destination":18,"relation":"conj_and"},{"source":14,"destination":13,"relation":"det"},{"source":18,"destination":16,"relation":"nn"},{"source":18,"destination":17,"relation":"nn"}],"roots":[10]}}}]} """) val sent2 = doc2.sentences.head - text2 should "have the same getHeadStrict as roots" in { + + behavior of text2 + + it should "have the same getHeadStrict as roots" in { val head = findHeadStrict(Interval(0, 20), sent2).get // There are multiple, unsorted roots, so a simple head is not sufficient. val roots = sent2.dependencies.get.roots @@ -68,20 +74,29 @@ class TestDependencyUtils extends Test { val text3 = "." 
val doc3 = jsonStringToDocument(""" {"sentences":[{"words":["."],"startOffsets":[0],"endOffsets":[1],"tags":["."],"lemmas":["."],"entities":["O"],"norms":["O"],"chunks":["O"],"graphs":{"universal-basic":{"edges":[],"roots":[0]},"universal-enhanced":{"edges":[],"roots":[0]}}}]} """) val sent3 = doc3.sentences.head - text3 should "produce one head using findHeads" in { - findHeads(Interval(0, 1), sent3.dependencies.get) should have size (1) + + behavior of text3 + + it should "produce one head using findHeads" in { + val heads = findHeads(Interval(0, 1), sent3.dependencies.get) + + heads should have size (1) + distToRootOpt(Interval(0, 1), sent3.dependencies.get) should be (Some(0)) } - text3 should "produce no heads using findHeadsStrict" in { + it should "produce no heads using findHeadsStrict" in { findHeadsStrict(Interval(0, 1), sent3) should have size (0) } - "DependencyUtils" should "handle cycles in the dependencyGraph correctly" in { + behavior of "DependencyUtils" + + it should "handle cycles in the dependencyGraph correctly" in { val edges = List((1, 0, "det"),(1,3,"rcmod"),(3,1,"nsubj"),(3,6,"prep_at"),(6,5,"nn"), (8,1,"nsubj"),(8,7,"advmod"),(8,12,"dobj"),(8,20,"prep_in"),(12,9,"det"),(12,10,"nn"), (12,11,"nn"),(12,13,"partmod"),(13,16,"prep_for"),(16,15,"nn"),(20,19,"amod")) val depGraph = new DirectedGraph[String](DirectedGraph.triplesToEdges[String](edges)) val tokenInterval = Interval(0, 2) noException shouldBe thrownBy (DependencyUtils.findHeads(tokenInterval, depGraph)) + distToRootOpt(tokenInterval, depGraph) should be (Some(1)) } it should "handle roots with incoming dependencies" in { @@ -94,6 +109,7 @@ class TestDependencyUtils extends Test { val graph = DirectedGraph(DirectedGraph.triplesToEdges[String](edges)) val interval = Interval(4, 8) noException shouldBe thrownBy (DependencyUtils.findHeads(interval, graph)) + distToRootOpt(interval, graph) should be (Some(0)) } // this test comes from sentence 23556 in file 
/data/nlp/corpora/agiga/data/xml/afp_eng_199405.xml.gz @@ -110,6 +126,7 @@ class TestDependencyUtils extends Test { val graph = DirectedGraph(DirectedGraph.triplesToEdges[String](edges)) val interval = Interval(21, 23) noException shouldBe thrownBy (DependencyUtils.findHeads(interval, graph)) + distToRootOpt(interval, graph) should be (Some(1)) } } diff --git a/main/src/test/scala/org/clulab/utils/TestPrintUtils.scala b/main/src/test/scala/org/clulab/utils/TestPrintUtils.scala index 4f68fff2e..66e3e1fa4 100644 --- a/main/src/test/scala/org/clulab/utils/TestPrintUtils.scala +++ b/main/src/test/scala/org/clulab/utils/TestPrintUtils.scala @@ -2,8 +2,6 @@ package org.clulab.utils import org.clulab.utils.PrintUtils._ -import java.io.{PrintWriter, StringWriter} - class TestPrintUtils extends Test { val int = 5 val string = "hello" @@ -15,20 +13,11 @@ class TestPrintUtils extends Test { behavior of "PrintUtils" - def withPrintWriter(f: PrintWriter => Unit): String = { - val stringWriter = new StringWriter - val printWriter = new PrintWriter(stringWriter) - - f(printWriter) - printWriter.close() - stringWriter.toString - } - it should "print with no arguments" in { def test(any: Any, expectedResult: String): Unit = { - val standardResult = withPrintWriter(_.print(any)) - val customResult = withPrintWriter { printWriter => any.print(printWriter) } + val standardResult = StringUtils.viaPrintWriter(_.print(any)) + val customResult = StringUtils.viaPrintWriter { printWriter => any.print(printWriter) } println(standardResult) println(customResult) @@ -51,7 +40,7 @@ class TestPrintUtils extends Test { val end = ">" def test(any: Any, expectedResult: String): Unit = { - val customResult = withPrintWriter { printWriter => any.print(printWriter, start, sep, end) } + val customResult = StringUtils.viaPrintWriter { printWriter => any.print(printWriter, start, sep, end) } println(customResult) customResult should be (expectedResult) diff --git 
a/main/src/test/scala/org/clulab/utils/TestSerializer.scala b/main/src/test/scala/org/clulab/utils/TestSerializer.scala index ab26e0482..6372e25fc 100644 --- a/main/src/test/scala/org/clulab/utils/TestSerializer.scala +++ b/main/src/test/scala/org/clulab/utils/TestSerializer.scala @@ -1,6 +1,7 @@ package org.clulab.utils import java.io.PrintWriter +import scala.util.Using class TestSerializer extends Test { @@ -9,8 +10,10 @@ class TestSerializer extends Test { it should "not close a null resource" in { val printWriter: PrintWriter = null - Serializer.using(printWriter) { printWriter => - println(printWriter) + assertThrows[NullPointerException] { + Using.resource(printWriter) { printWriter => + println(printWriter) + } } } } diff --git a/openie/src/main/scala/org/clulab/openie/ResourceUtils.scala b/openie/src/main/scala/org/clulab/openie/ResourceUtils.scala index 90a7728cb..655c9b6b3 100644 --- a/openie/src/main/scala/org/clulab/openie/ResourceUtils.scala +++ b/openie/src/main/scala/org/clulab/openie/ResourceUtils.scala @@ -1,7 +1,7 @@ package org.clulab.openie import java.io.InputStream - +import scala.util.Using object ResourceUtils { @@ -13,10 +13,8 @@ object ResourceUtils { } def readResource(path: String): String = { - val stream = streamFromResource(path) - val source = scala.io.Source.fromInputStream(stream) - val data = source.mkString - source.close() - data + Using.resource(scala.io.Source.fromInputStream(streamFromResource(path))) { source => + source.mkString + } } } diff --git a/project/build.properties b/project/build.properties index 478a7eaa8..11956d958 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1,4 +1,4 @@ -# Version 1.8.x will cause problems when combined with the play plug-in used for the webapp! +# Version 1.7.2+ will cause problems when combined with the play plug-in used for the webapp! 
# [error] * org.scala-lang.modules:scala-xml_2.12:2.1.0 (early-semver) is selected over {1.2.0, 1.1.1} # [error] +- org.scala-lang:scala-compiler:2.12.17 (depends on 2.1.0) # [error] +- com.typesafe.sbt:sbt-native-packager:1.5.2 (scalaVersion=2.12, sbtVersion=1.0) (depends on 1.1.1) diff --git a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala index de5099d08..9f4691529 100644 --- a/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala +++ b/webapp/app/org/clulab/processors/webapp/controllers/HomeController.scala @@ -2,7 +2,7 @@ package org.clulab.processors.webapp.controllers import org.clulab.odin.{CrossSentenceMention, EventMention, ExtractorEngine, Mention, RelationMention, TextBoundMention} import org.clulab.processors.Processor -import org.clulab.processors.clu.CluProcessor +import org.clulab.processors.clu.BalaurProcessor import org.clulab.processors.webapp.serialization.WebSerializer import org.clulab.sequences.LexiconNER import org.clulab.utils.{FileUtils, Unordered} @@ -33,7 +33,7 @@ class HomeController @Inject()(cc: ControllerComponents) extends AbstractControl val kbs = customLexiconNerConfigs.map(_.kb) val caseInsensitiveMatchings = customLexiconNerConfigs.map(_.caseInsensitiveMatching) val customLexiconNer = LexiconNER(kbs, caseInsensitiveMatchings, None) - val processor = new CluProcessor(optionalNER = Some(customLexiconNer)) + val processor = new BalaurProcessor(optionalNER = Some(customLexiconNer)) processor } @@ -126,7 +126,7 @@ class HomeController @Inject()(cc: ControllerComponents) extends AbstractControl Ok(views.html.index()) } - def parseText(text: String): Action[AnyContent] = Action { + def parseText(text: String): Action[AnyContent] = Action { implicit request: Request[AnyContent] => println("Text:") println(text) println() diff --git a/webapp/app/org/clulab/processors/webapp/serialization/MentionsObj.scala 
b/webapp/app/org/clulab/processors/webapp/serialization/MentionsObj.scala index 8c0b80cb0..2e3747870 100644 --- a/webapp/app/org/clulab/processors/webapp/serialization/MentionsObj.scala +++ b/webapp/app/org/clulab/processors/webapp/serialization/MentionsObj.scala @@ -39,7 +39,7 @@ class MentionsObj(mentions: Seq[Mention]) { def getTd(field: String, text: String): String = s""" |$leftTdHeader - | ${xml.Utility.escape(field)}:  + | ${xml.Utility.escape(field)}: |$tdSeparator | ${xml.Utility.escape(text)} |$tdTrailer diff --git a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala index d35e05961..cd80abfc7 100644 --- a/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala +++ b/webapp/app/org/clulab/processors/webapp/serialization/ParseObj.scala @@ -7,7 +7,12 @@ class ParseObj(doc: Document) { def mkParseObj(sentence: Sentence, sb: StringBuilder): Unit = { - def getTd(text: String): String = "" + xml.Utility.escape(text) + "" + def getTd(text: String, right: Boolean = false): String = { + val head = if (right) """""" else "" + val tail = "" + + head + xml.Utility.escape(text) + tail + } def getTdAtOptString(option: Option[Array[String]], n: Int): String = { val text = @@ -19,7 +24,7 @@ class ParseObj(doc: Document) { def getTdAtString(values: Array[String], n: Int): String = getTd(values(n)) - def getTdAtInt(values: Array[Int], n: Int): String = getTd(values(n).toString) + def getTdAtInt(values: Array[Int], n: Int): String = getTd(values(n).toString, true) def edgesToString(to: Int): String = { val edges = sentence.dependencies.get.incomingEdges(to) @@ -30,6 +35,7 @@ class ParseObj(doc: Document) { sentence.words.indices.foreach { i => sb .append("") + .append(s"""$i""") .append(getTdAtString(sentence.raw, i)) .append(getTdAtInt(sentence.startOffsets, i)) .append(getTdAtInt(sentence.endOffsets, i)) @@ -39,7 +45,6 @@ class ParseObj(doc: Document) { 
.append(getTdAtOptString(sentence.entities, i)) .append(getTdAtOptString(sentence.norms, i)) .append(getTdAtOptString(sentence.chunks, i)) - .append(getTdAtString(sentence.raw, i)) .append(getTd(edgesToString(i))) .append("") } @@ -49,7 +54,8 @@ class ParseObj(doc: Document) { val header = """ | | - | + | + | | | | @@ -58,7 +64,6 @@ class ParseObj(doc: Document) { | | | - | | | |""".stripMargin diff --git a/webapp/build.sbt b/webapp/build.sbt index 218599a0a..7b3e8cfad 100644 --- a/webapp/build.sbt +++ b/webapp/build.sbt @@ -5,7 +5,10 @@ libraryDependencies ++= Seq( // Versions were last checked 2023 Jan 31. guice, // Newer than 4.0.3 does not work for Scala 2.11. There is no Scala 3 version. - "org.scalatestplus.play" %% "scalatestplus-play" % "4.0.3" % Test // up to 5.1.0 + // See https://github.com/playframework/scalatestplus-play#releases. + // For play 2.8.19, need scalatestplus-play 5.1.0 and Scalatest 3.1.x. + // So, if we test, then we rule out Scala 2.11. + "org.scalatestplus.play" %% "scalatestplus-play" % "5.1.0" % Test // up to 5.1.0 ) // In general, we do not want to include routes or application.conf in diff --git a/webapp/public/stylesheets/main.css b/webapp/public/stylesheets/main.css index cc6a6db3a..926ea584f 100644 --- a/webapp/public/stylesheets/main.css +++ b/webapp/public/stylesheets/main.css @@ -12,6 +12,10 @@ table, th,td { border: 1px solid black; font-size: inherit; } +th, td { + padding-left: 0.5em; + padding-right: 0.5em; +} h1 { font-size: 150%; diff --git a/webapp/test/controllers/HomeControllerSpec.scala b/webapp/test/controllers/HomeControllerSpec.scala index c6f8d80bc..6867e256c 100644 --- a/webapp/test/controllers/HomeControllerSpec.scala +++ b/webapp/test/controllers/HomeControllerSpec.scala @@ -1,11 +1,11 @@ package controllers -import org.clulab.wm.eidoscommon.utils.Resourcer +import org.clulab.processors.webapp.controllers.HomeController import org.scalatestplus.play._ import org.scalatestplus.play.guice._ +import 
play.api.libs.json._ import play.api.test._ import play.api.test.Helpers._ -import play.api.libs.json._ /** * Add your spec here. @@ -14,88 +14,30 @@ import play.api.libs.json._ * For more information, see https://www.playframework.com/documentation/latest/ScalaTestingWithScalaTest */ class HomeControllerSpec extends PlaySpec with GuiceOneAppPerTest with Injecting { + val homeContent = "processors visualizer" + val fakeRequest = FakeRequest(GET, "/") - "HomeController GET" should { - - "render the index page from a new instance of controller" in { - val controller = new HomeController(stubControllerComponents()) - val home = controller.index().apply(FakeRequest(GET, "/")) - - status(home) mustBe OK - contentType(home) mustBe Some("text/html") - contentAsString(home) must include ("World Modelers Visualizer") - } + "HomeController GET" must { "render the index page from the application" in { val controller = inject[HomeController] - val home = controller.index().apply(FakeRequest(GET, "/")) - - status(home) mustBe OK - contentType(home) mustBe Some("text/html") - contentAsString(home) must include ("World Modelers Visualizer") - } - - "render the index page from the router" in { - val request = FakeRequest(GET, "/") - val home = route(app, request).get - - status(home) mustBe OK - contentType(home) mustBe Some("text/html") - contentAsString(home) must include ("World Modelers Visualizer") - } - } - - "HomeController POST" should { - "accept request with text parameter and return JSON" in { - - // Note that the request fails because the JSON does not have key 'text' but instead has key 'text123' - // This is because testing an actual run requires initialization which takes too long - - val testJson = Json.parse("""{ "text123": "Drought causes regional instability." 
}""") - val request = FakeRequest(POST, "/process_text").withJsonBody(testJson) - val result = route(app, request).get + val response = controller.index().apply(fakeRequest) - contentAsString(result) must include ("Missing parameter [text]") + status(response) mustBe OK + contentType(response) mustBe Some("text/html") + contentAsString(response) must include(homeContent) } - "be able to reground" in { - val name = "test" - // This was simply chosen because it is the smallest. - val ontologyYaml = Resourcer.getText("/org/clulab/wm/eidos/english/ontologies/un_properties.yml") - val texts = Array( - "Rainfall in the summer causes good crop yields in the fall.", - "This is another text that should be grounded." - ) - val filter = true - val topk = 5 - val isAlreadyCanonicalized = false - val regroundRequest = JsObject { Map( - "name" -> JsString(name), - "ontologyYaml" -> JsString(ontologyYaml), - "texts" -> JsArray(texts.map(JsString)), - "filter" -> JsBoolean(filter), - "topk" -> JsNumber(topk), - "isAlreadyCanonicalized" -> JsBoolean(isAlreadyCanonicalized) - ) } - val request = FakeRequest(POST, "/reground").withJsonBody(regroundRequest) - val regroundResponse = contentAsJson(route(app, request).get) - - val outerJsArray = regroundResponse.as[JsArray] - outerJsArray.value.size must be (texts.length) - - outerJsArray.value.foreach { jsValue: JsValue => - val innerJsArray = jsValue.as[JsArray] - innerJsArray.value.size must be (topk) - - innerJsArray.value.foreach { jsValue => - val jsObject = jsValue.as[JsObject] - val grounding = (jsObject \ "grounding").as[String] - val score = (jsObject \ "score").as[Double] + "render the parse from the application" in { + val text = "John eats cake." 
+ val controller = inject[HomeController] + val json = Json.parse(s"""{ "text": "$text" }""") + val request = FakeRequest(GET, "/parseText").withJsonBody(json) + val response = controller.parseText(text).apply(request) - grounding.nonEmpty mustBe (true) - score > 0 mustBe (true) - } - } + status(response) mustBe OK + contentType(response) mustBe Some("application/json") + contentAsString(response) must include(text) } } }
TextIndexRawStartEndWordEntitiesNormsChunksRawDependencies