-
Notifications
You must be signed in to change notification settings - Fork 170
Expand file tree
/
Copy pathnewparse.cpp
More file actions
executable file
·124 lines (116 loc) · 3.64 KB
/
newparse.cpp
File metadata and controls
executable file
·124 lines (116 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// Read a file and parse each line according to the format file.
// Output integer matrices with lookup maps in either ascii text
// or matlab binary form.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <string>
#include "utils.h"
#include "gzstream.h"
// Global parser state shared between the lexer callbacks below and main().
// The ivector / unhash / strhash types are declared elsewhere (presumably
// utils.h -- confirm there for exact semantics).

// Per-word counts; handed to checkword() and written to <dictfile>.imat at exit.
ivector wcount;
// Token stream for the input file currently being scanned; written out and
// cleared after every input file.
ivector tokens;
// One entry per checkword() call: the paragraphid in effect at that call.
ivector paragraphids;
// One entry per checkword() call: the sentenceid in effect at that call.
ivector sentenceids;
// Dictionary data written to <dictfile>.sbmat at exit (presumably the
// index -> string table; verify against writeSBVecs).
unhash unh;
// Hash table handed to checkword() (presumably string -> index; verify
// against the checkword overload it is passed to).
strhash htab;
// Symbols with C linkage: yylex/yyin are defined elsewhere (the scanner),
// while the counters below are defined here. NOTE(review): they are
// presumably updated by the scanner's actions -- confirm in the lexer source.
extern "C" {
// Scanner entry point; main() calls it once per input file.
extern int yylex(void);
// Input stream the scanner reads from; main() points it at each input file.
extern FILE* yyin;
// Line counter reported by main() after each file and then reset to 0.
int numlines=0;
// When non-zero, main() writes paragraph ids, sentence ids and tokens
// together as three columns instead of tokens only.
int doparagraphids=0;
// Current paragraph id; recorded by checkword() and reset to 0 per file.
int paragraphid=0;
// Current sentence id; recorded by checkword() and reset to 0 per file.
int sentenceid=0;
}
// Single-argument convenience wrapper around the five-argument checkword()
// overload. Before delegating it appends the paragraph and sentence ids that
// are current at the time of the call, so paragraphids/sentenceids grow by
// one entry per call. The overload's result is returned unchanged.
int checkword(char *str) {
  paragraphids.push_back(paragraphid);
  sentenceids.push_back(sentenceid);
  const int result = checkword(str, htab, wcount, tokens, unh);
  return result;
}
// Single-argument convenience wrapper: forwards `tok` to the two-argument
// addtok() overload, targeting the global `tokens` vector.
void addtok(int tok) {
addtok(tok, tokens);
}
// Help text printed by main() for the -? and -h options.
// Note: main() splits the -i value on spaces and commas, so several input
// files can be given as one comma-separated argument.
const char usage[] =
"\nParser for specialized input files. Arguments are\n"
" -i <infile> input file[s] to read\n"
" -o <outd> output files will be written to <outd><infile>.imat[.gz]\n"
" -d <dictfile> dictionary will be written to <dictfile>.sbmat[.gz]\n"
" and word counts will be written to <dictfile>.imat[.gz]\n"
" -s N set buffer size to N.\n"
" -c produce compressed (gzipped) output files.\n\n"
;
// Returns true when `name` ends with the literal suffix ".gz".
// Compares the last three characters directly, so it is safe for names that
// are shorter than the suffix or that contain ".gz" somewhere in the middle.
static bool endsWithGz(const char *name) {
  const size_t len = strlen(name);
  return len >= 3 && strncmp(name + len - 3, ".gz", 3) == 0;
}

// Returns the value that follows the option at argv[iarg] and advances iarg
// onto it. Prints a diagnostic plus the usage text and exits with status 1
// when the option is the last argument and therefore has no value.
static char *requireOptionValue(int argc, char **argv, int &iarg) {
  if (iarg + 1 >= argc) {
    fprintf(stderr, "Missing value for option %s\n%s", argv[iarg], usage);
    exit(1);
  }
  return argv[++iarg];
}

// Program entry point.
//
// Parses the command line (see `usage`), then for every input file named by
// -i (a list separated by spaces or commas):
//   1. opens it (through "gunzip -c" when the name ends in ".gz"),
//   2. runs the scanner (yylex) over it,
//   3. writes the collected tokens -- or paragraph ids, sentence ids and
//      tokens when doparagraphids is set -- to <outd><basename>.imat[.gz],
//   4. clears the per-file state.
// Finally writes the word counts and the dictionary to <dictfile>.imat[.gz]
// and <dictfile>.sbmat[.gz].
//
// Returns 0 on success, 1 for -h / -? and for usage errors. Exits with
// status 1 on an unknown option, a missing option value, or an input file
// that cannot be opened.
int main(int argc, char ** argv) {
  int iarg = 1;
  int membuf = 1048576;
  char *ifname = NULL;
  std::string odname = "", dictname = "", suffix = "";

  // Options are matched on their first two characters only, as before.
  while (iarg < argc) {
    if (strncmp(argv[iarg], "-i", 2) == 0) {
      ifname = requireOptionValue(argc, argv, iarg);
    } else if (strncmp(argv[iarg], "-o", 2) == 0) {
      odname = requireOptionValue(argc, argv, iarg);
    } else if (strncmp(argv[iarg], "-d", 2) == 0) {
      dictname = requireOptionValue(argc, argv, iarg);
    } else if (strncmp(argv[iarg], "-s", 2) == 0) {
      membuf = strtol(requireOptionValue(argc, argv, iarg), NULL, 10);
    } else if (strncmp(argv[iarg], "-c", 2) == 0) {
      suffix = ".gz";
    } else if (strncmp(argv[iarg], "-?", 2) == 0 ||
               strncmp(argv[iarg], "-h", 2) == 0) {
      printf("%s", usage);
      return 1;
    } else {
      std::cout << "Unknown option " << argv[iarg] << std::endl;
      exit(1);
    }
    iarg++;
  }

  // Without -i there is nothing to tokenize; calling strtok on a null
  // pointer with no previous string would be undefined behavior.
  if (ifname == NULL) {
    fprintf(stderr, "No input file given (-i)\n%s", usage);
    return 1;
  }
  if (dictname.size() == 0) dictname = odname + "dict";

  // strtok splits the -i argument in place (it points into argv).
  char *here = strtok(ifname, " ,");
  while (here != NULL) {
    const bool gzipped = endsWithGz(here);
    if (gzipped) {
#if defined __CYGWIN__ || ! defined __GNUC__
      printf("cant use compressed files in cygwin\n");
      exit(1);
#else
      // NOTE(review): the file name is pasted into a shell command line
      // unquoted; names containing shell metacharacters will be interpreted
      // by the shell.
      yyin = popen((std::string("gunzip -c ") + here).c_str(), "r");
#endif
    } else {
      yyin = fopen(here, "r");
    }
    // Refuse to scan or close a stream that was never opened.
    if (yyin == NULL) {
      perror(here);
      exit(1);
    }
    fprintf(stderr, "\nScanning %s\n", here);
    fflush(stderr);
    yylex();
    if (gzipped) {
#if ! defined __CYGWIN__ && defined __GNUC__
      pclose(yyin);
#endif
    } else {
      fclose(yyin);
    }
    fprintf(stderr, "\r%05d lines", numlines);
    fflush(stderr);

    // Output name: input name without a trailing ".gz" and without any
    // directory part. '/' takes precedence; '\\' is only tried when the name
    // contains no '/'.
    std::string rname = here;
    if (gzipped) {
      rname = rname.substr(0, rname.size() - 3);
    }
    std::string::size_type pos = rname.rfind('/');
    if (pos == std::string::npos) pos = rname.rfind('\\');
    if (pos != std::string::npos) rname = rname.substr(pos + 1);

    if (doparagraphids) {
      writeIntVec3Cols(paragraphids, sentenceids, tokens, odname+rname+".imat"+suffix, membuf);
    } else {
      writeIntVec(tokens, odname+rname+".imat"+suffix, membuf);
    }

    // Reset per-file state; wcount, htab and unh accumulate across files.
    tokens.clear();
    sentenceids.clear();
    paragraphids.clear();
    numlines = 0;
    sentenceid = 0;
    paragraphid = 0;
    here = strtok(NULL, " ,");
  }

  fprintf(stderr, "\nWriting Dictionary\n");
  writeIntVec(wcount, dictname+".imat"+suffix, membuf);
  writeSBVecs(unh, dictname+".sbmat"+suffix, membuf);
  return 0;
}