Skip to content

Commit f1fcd97

Browse files
committed
add gzip support for in/out of SortBED
Add Gzip support for the Sort BED tool. #91 CDTUtilities - updated to support gzipped input CDT files - decorate with Javadocs SortBEDWindow - update file selection (both BED & CDT) to include gzipped extensions of *.bed and *.cdt - update to support gzipping output files with "output GZIP" checkbox - update default filename to use input BED filename and update with ".gz" according to the gzip checkbox - adjust margins so components aren't up against the edge of the window frame - switch "Convert" to "Execute" in component naming and display text - decorate with Javadocs SortBED - add support for gzipped inputs and efficient file parsing with BufferedReader for both .bed and .cdt inputs - add support for gzipping outputs by adding a gzOutput boolean to the method signature - decorate with Javadocs SortBEDCLI - add gzip flag and update script method call - remove input extension restrictions on CLI
1 parent e895d0b commit f1fcd97

4 files changed

Lines changed: 204 additions & 58 deletions

File tree

src/cli/Coordinate_Manipulation/BED_Manipulation/SortBEDCLI.java

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public class SortBEDCLI implements Callable<Integer> {
3535
private String outputBasename = null;
3636
@Option(names = {"-c", "--center"}, description = "sort by center on the input size of expansion in bins (default=100)")
3737
private int center = -999;
38+
@Option(names = {"-z", "--gzip"}, description = "gzip output (default=false)")
39+
private boolean gzOutput = false;
3840
@Option(names = {"-x", "--index"}, description = "sort by index from the specified start to the specified stop (0-indexed and half-open interval)",
3941
arity = "2")
4042
private int[] index = {-999, -999};
@@ -57,7 +59,7 @@ public Integer call() throws Exception {
5759
index[1] = (CDT_SIZE / 2) + (center / 2);
5860
}
5961

60-
SortBED.sortBEDbyCDT(outputBasename, bedFile, cdtFile, index[0], index[1]);
62+
SortBED.sortBEDbyCDT(outputBasename, bedFile, cdtFile, index[0], index[1], gzOutput);
6163

6264
System.err.println("Sort Complete");
6365
return(0);
@@ -74,13 +76,6 @@ private String validateInput() throws IOException {
7476
r += "(!)CDT file does not exist: " + cdtFile.getName() + "\n";
7577
}
7678
if(!"".equals(r)){ return(r); }
77-
//check input extensions
78-
if(!"bed".equals(ExtensionFileFilter.getExtension(bedFile))){
79-
r += "(!)Is this a BED file? Check extension: " + bedFile.getName() + "\n";
80-
}
81-
if(!"cdt".equals(ExtensionFileFilter.getExtension(cdtFile))){
82-
r += "(!)Is this a CDT file? Check extension: " + cdtFile.getName() + "\n";
83-
}
8479
// validate CDT as file, with consistent row size, and save row_size value
8580
try {
8681
CDTUtilities cdt_obj = new CDTUtilities();
Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,54 @@
11
package scripts.Coordinate_Manipulation.BED_Manipulation;
22

3+
import java.io.BufferedOutputStream;
4+
import java.io.BufferedReader;
35
import java.io.File;
6+
import java.io.FileInputStream;
7+
import java.io.FileOutputStream;
48
import java.io.IOException;
9+
import java.io.InputStreamReader;
510
import java.io.PrintStream;
611
import java.util.ArrayList;
712
import java.util.Collections;
813
import java.util.HashMap;
9-
import java.util.Scanner;
14+
import java.util.zip.GZIPInputStream;
15+
import java.util.zip.GZIPOutputStream;
1016

1117
import objects.CoordinateObjects.BEDCoord;
18+
import util.GZipUtilities;
1219

20+
/**
21+
* This class contains scripts for sorting coordinate intervals (BED/GFF) by the tag counts of a CDT matrix file.
22+
*
23+
* @author William KM Lai
24+
*
25+
*/
1326
public class SortBED {
14-
public static void sortBEDbyCDT(String outname, File bed, File cdt, int START_INDEX, int STOP_INDEX)
15-
throws IOException {
27+
/**
28+
* Sort a BED file by the values from a CDT matrix file. Includes Gzip support.
29+
*
30+
* @param outbase Filepath basename (without ext) to save the sorted BED ({@code <basename>.bed}) and sorted CDT ({@code <basename>.cdt}) files; ".gz" is appended to each filename when gzOutput is true.
31+
* @param bed input BED file to sort
32+
* @param cdt input CDT file with values to sort by
33+
* @param START_INDEX the start column to consider when summing values to sort
34+
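* @param STOP_INDEX the stop column to consider when summing values to sort (the CLI documents the start/stop range as 0-indexed and half-open)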
35+
* @param gzOutput if true, the output files will be gzipped.
36+
* @throws IOException
37+
*/
38+
public static void sortBEDbyCDT(String outbase, File bed, File cdt, int START_INDEX, int STOP_INDEX, boolean gzOutput ) throws IOException {
1639
ArrayList<BEDCoord> SORT = new ArrayList<BEDCoord>();
1740
HashMap<String, String> CDTFile = new HashMap<String, String>();
1841
String CDTHeader = "";
19-
// Parse CDT File first
20-
Scanner scan = new Scanner(cdt);
21-
while (scan.hasNextLine()) {
22-
String line = scan.nextLine();
42+
// Check if file is gzipped and instantiate appropriate BufferedReader
43+
BufferedReader br;
44+
if(GZipUtilities.isGZipped(cdt)) {
45+
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(cdt)), "UTF-8"));
46+
} else {
47+
br = new BufferedReader(new InputStreamReader(new FileInputStream(cdt), "UTF-8"));
48+
}
49+
// Initialize line variable to loop through
50+
String line = br.readLine();
51+
while (line != null) {
2352
String[] ID = line.split("\t");
2453
if (!ID[0].contains("YORF") && !ID[0].contains("NAME")) {
2554
double count = 0;
@@ -31,14 +60,20 @@ public static void sortBEDbyCDT(String outname, File bed, File cdt, int START_IN
3160
} else {
3261
CDTHeader = line;
3362
}
63+
line = br.readLine();
3464
}
35-
scan.close();
65+
br.close();
3666
// Sort by score
3767
Collections.sort(SORT, BEDCoord.ScoreComparator);
3868

69+
PrintStream OUT;
70+
// Initialize output writer
71+
if (gzOutput) {
72+
OUT = new PrintStream(new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(outbase + ".cdt.gz"))));
73+
} else {
74+
OUT = new PrintStream(new BufferedOutputStream(new FileOutputStream(outbase + ".cdt")));
75+
}
3976
// Output sorted CDT File
40-
String newCDT = outname + ".cdt";
41-
PrintStream OUT = new PrintStream(newCDT);
4277
OUT.println(CDTHeader);
4378
for (int x = 0; x < SORT.size(); x++) {
4479
OUT.println(CDTFile.get(SORT.get(x).getName()));
@@ -48,22 +83,34 @@ public static void sortBEDbyCDT(String outname, File bed, File cdt, int START_IN
4883

4984
// Match to bed file after
5085
HashMap<String, String> BEDFile = new HashMap<String, String>();
51-
scan = new Scanner(bed);
52-
while (scan.hasNextLine()) {
53-
String line = scan.nextLine();
86+
87+
// Check if file is gzipped and instantiate appropriate BufferedReader
88+
if(GZipUtilities.isGZipped(bed)) {
89+
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bed)), "UTF-8"));
90+
} else {
91+
br = new BufferedReader(new InputStreamReader(new FileInputStream(bed), "UTF-8"));
92+
}
93+
// Initialize line variable to loop through
94+
line = br.readLine();
95+
while (line != null) {
5496
String ID = line.split("\t")[3];
5597
if (!ID.contains("YORF") && !ID.contains("NAME")) {
5698
BEDFile.put(ID, line);
5799
}
100+
line = br.readLine();
101+
}
102+
br.close();
103+
104+
// Initialize output writer
105+
if (gzOutput) {
106+
OUT = new PrintStream(new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(outbase + ".bed.gz"))));
107+
} else {
108+
OUT = new PrintStream(new BufferedOutputStream(new FileOutputStream(outbase + ".bed")));
58109
}
59-
scan.close();
60110
// Output sorted BED File
61-
String newBED = outname + ".bed";
62-
OUT = new PrintStream(newBED);
63111
for (int x = 0; x < SORT.size(); x++) {
64112
OUT.println(BEDFile.get(SORT.get(x).getName()));
65113
}
66114
OUT.close();
67115
}
68-
69-
}
116+
}

src/util/CDTUtilities.java

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,57 @@
11
package util;
22

3+
import java.io.BufferedReader;
34
import java.io.File;
5+
import java.io.FileInputStream;
46
import java.io.FileNotFoundException;
7+
import java.io.IOException;
8+
import java.io.InputStreamReader;
59
import java.util.ArrayList;
610
import java.util.Collections;
711
import java.util.HashMap;
812
import java.util.Map;
913
import java.util.Scanner;
1014
import java.util.Set;
1115
import java.util.Vector;
16+
import java.util.zip.GZIPInputStream;
1217

18+
/**
19+
* This class was created to parse and validate CDT files, including counting the number of columns, and is based on methods that were originally tool-specific.
20+
*
21+
* @author William KM Lai
22+
* @see window_interface.Coordinate_Manipulation.BED_Manipulation.SortBEDWindow
23+
* @see scripts.Coordinate_Manipulation.BED_Manipulation.SortBED
24+
* @see scripts.Coordinate_Manipulation.GFF_Manipulation.SortGFF
25+
*
26+
*/
1327
public class CDTUtilities {
1428

1529
private int SIZE;
1630
private boolean consistentSize;
1731
private String invalidMessage;
1832

19-
// This function is almost exactly copied from window/*/SortBEDWindow & scripts/*/SortBED & scripts/*/SortGFF...good practice to merge at some point.
20-
public void parseCDT(File CDT) throws FileNotFoundException {
33+
/**
34+
* Parse a CDT-formatted file, checking that every data row has a consistent number of columns and recording that row size
35+
* @param CDT a CDT-formatted file to validate
36+
* @throws IOException
37+
*/
38+
public void parseCDT(File CDT) throws IOException {
2139
SIZE = -999;
2240
consistentSize = true;
2341
invalidMessage = "";
2442

25-
Scanner scan = new Scanner(CDT);
43+
// Check if file is gzipped and instantiate appropriate BufferedReader
44+
BufferedReader br;
45+
if(GZipUtilities.isGZipped(CDT)) {
46+
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(CDT)), "UTF-8"));
47+
} else {
48+
br = new BufferedReader(new InputStreamReader(new FileInputStream(CDT), "UTF-8"));
49+
}
50+
// Initialize line variable to loop through
51+
String line = br.readLine();
2652
int currentRow = 1;
27-
while (scan.hasNextLine()) {
28-
String[] temp = scan.nextLine().split("\t");
53+
while (line != null) {
54+
String[] temp = line.split("\t");
2955
if(!temp[0].contains("YORF") && !temp[0].contains("NAME")) {
3056
int tempsize = temp.length - 2;
3157
if(SIZE == -999) { SIZE = tempsize; }
@@ -36,8 +62,9 @@ else if(SIZE != tempsize) {
3662
}
3763
currentRow++;
3864
}
65+
line = br.readLine();
3966
}
40-
scan.close();
67+
br.close();
4168
}
4269

4370
public boolean isValid(){ return consistentSize; }
@@ -63,7 +90,12 @@ public static Vector<double[]> loadCDT(File input) throws FileNotFoundException
6390
scan.close();
6491
return matrix;
6592
}
66-
93+
94+
/**
95+
* Given a 2D-array formatted as a vector of primitive array types, return the average composite.
96+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
97+
* @return an array of positional composite average values from the input matrix
98+
*/
6799
public static double[] getComposite(Vector<double[]> CDT) {
68100
double[] AVG = new double[CDT.get(0).length];
69101
double COUNT = 0;
@@ -77,7 +109,12 @@ public static double[] getComposite(Vector<double[]> CDT) {
77109
for(int x = 0; x < AVG.length; x++) { AVG[x] /= COUNT; }
78110
return AVG;
79111
}
80-
112+
113+
/**
114+
* Given a 2D-array formatted as a vector of primitive array types, return some basic statistics.
115+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
116+
* @return an ArrayList of statistics on the input matrix
117+
*/
81118
public static ArrayList<Double> getStats(Vector<double[]> CDT) {
82119
ArrayList<Double> STATS = new ArrayList<Double>();
83120
ArrayList<Double> values = new ArrayList<Double>();
@@ -127,7 +164,12 @@ public static ArrayList<Double> getStats(Vector<double[]> CDT) {
127164
STATS.add(mode);
128165
return STATS;
129166
}
130-
167+
168+
/**
169+
* Given a 2D-array formatted as a vector of primitive array types, return the non-zero maximum value.
170+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
171+
* @return the maximum value ignoring zeros of the input matrix.
172+
*/
131173
public static Double getMax(Vector<double[]> CDT) {
132174
double max = 0;
133175
for(int x = 0; x < CDT.size(); x++) {
@@ -139,7 +181,12 @@ public static Double getMax(Vector<double[]> CDT) {
139181
}
140182
return max;
141183
}
142-
184+
185+
/**
186+
* Given a 2D-array formatted as a vector of primitive array types, return the non-zero minimum value.
187+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
188+
* @return the minimum value ignoring zeros of the input matrix.
189+
*/
143190
public static Double getMin(Vector<double[]> CDT) {
144191
double min = 0;
145192
for(int x = 0; x < CDT.size(); x++) {
@@ -152,7 +199,12 @@ public static Double getMin(Vector<double[]> CDT) {
152199
}
153200
return min;
154201
}
155-
202+
203+
/**
204+
* Given a 2D-array formatted as a vector of primitive array types, return the non-zero median value.
205+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
206+
* @return the median value ignoring zeros of the input matrix
207+
*/
156208
public static Double getMedian(Vector<double[]> CDT) {
157209
ArrayList<Double> values = new ArrayList<Double>();
158210
for(int x = 0; x < CDT.size(); x++) {
@@ -164,6 +216,7 @@ public static Double getMedian(Vector<double[]> CDT) {
164216
}
165217
Collections.sort(values);
166218

219+
// Averaging two floor/ceil middle values accounts for even/odd list size
167220
double pos1 = Math.floor((values.size() - 1.0) / 2.0);
168221
double pos2 = Math.ceil((values.size() - 1.0) / 2.0);
169222
if (pos1 == pos2 ) {
@@ -172,7 +225,12 @@ public static Double getMedian(Vector<double[]> CDT) {
172225
return (values.get((int)pos1) + values.get((int)pos2)) / 2.0 ;
173226
}
174227
}
175-
228+
229+
/**
230+
* Given a 2D-array formatted as a vector of primitive array types, return the non-zero average value.
231+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
232+
* @return the average value ignoring zeros of the input matrix
233+
*/
176234
public static Double getAverage(Vector<double[]> CDT) {
177235
double average = 0;
178236
int count = 0;
@@ -187,7 +245,12 @@ public static Double getAverage(Vector<double[]> CDT) {
187245
if(count != 0) return (average / count);
188246
else return 0.0;
189247
}
190-
248+
249+
/**
250+
* Given a 2D-array formatted as a vector of primitive array types, return the non-zero mode value.
251+
* @param CDT a Vector of primitive arrays of primitive doubles (decimal matrix)
252+
* @return the mode value ignoring zeros of the input matrix
253+
*/
191254
public static Double getMode(Vector<double[]> CDT) {
192255
double mode = 0;
193256
int modecount = 0;

0 commit comments

Comments
 (0)