Skip to content

Commit 6fcc22e

Browse files
authored
Merge pull request #30 from PathwayCommons/suhyma
Finalizing data-search script & cron job
2 parents a6900d1 + 05065f9 commit 6fcc22e

6 files changed

Lines changed: 119 additions & 76 deletions

File tree

.vscode/launch.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,19 @@
44
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
55
"version": "0.2.0",
66
"configurations": [
7+
{
8+
"name": "Test",
9+
"request": "launch",
10+
"runtimeArgs": [
11+
"run-script",
12+
"test"
13+
],
14+
"runtimeExecutable": "npm",
15+
"skipFiles": [
16+
"<node_internals>/**"
17+
],
18+
"type": "node"
19+
},
720
{
821
"name": "NPM watch",
922
"runtimeVersion": "16.17.0",

example-data/data-config.json

Lines changed: 9 additions & 0 deletions
Large diffs are not rendered by default.

package-lock.json

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
"fix": "eslint ./src --fix",
2828
"lint": "eslint ./src",
2929
"clean": "rimraf dist src/dashboard/build",
30-
"data": "echo \"Hello World!\""
30+
"data": "node src/data-search.js"
3131
},
3232
"author": "Pathway Commons",
3333
"license": "MIT",
@@ -55,6 +55,7 @@
5555
"date-fns": "^2.29.3",
5656
"eventemitter3": "^4.0.7",
5757
"get-stdin": "^9.0.0",
58+
"lodash": "^4.17.21",
5859
"minisearch": "^5.0.0",
5960
"node-fetch": "^3.2.6",
6061
"preact": "^10.11.0",

src/cli.js

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,45 @@ import { prettyArticles } from './pretty.js';
1212
const readFile = promisify(fs.readFile);
1313
const formatJSON = obj => JSON.stringify(obj, null, 2);
1414
const printFormattedJSON = obj => console.log(formatJSON(obj));
15-
const writeFormattedJSON = async (obj, file) => await writeFile(file, formatJSON(obj));
15+
export const writeFormattedJSON = async (obj, file) => await writeFile(file, formatJSON(obj));
1616
const writeText = async (text, file) => await writeFile(file, text);
1717
const printText = text => console.log(text);
1818
const getPrettyText = (articles, queryString, options) => prettyArticles(articles, queryString, options);
1919

2020
export async function search (queryString, options) {
21-
try {
22-
const searcher = new Search();
21+
const searcher = new Search();
2322

23+
if (options.array) {
24+
const articles = options.array;
25+
await searcher.articles(articles);
26+
} else {
2427
const articles = await getInput(options);
25-
2628
await searcher.articles(articles);
29+
}
2730

28-
const res = await searcher.search(queryString, {
29-
combineWith: options.strict ? 'AND' : 'OR'
30-
});
31+
const res = await searcher.search(queryString, {
32+
combineWith: options.strict ? 'AND' : 'OR'
33+
});
3134

32-
await sendOutput(res, options, queryString);
35+
const source = options.source ?? 'biorxiv';
3336

34-
return res;
35-
} catch (err) {
36-
console.error(`Error in search: ${err}`);
37-
throw err;
37+
if (source === 'biorxiv' || source === 'medrxiv') {
38+
try {
39+
const formattedRes = await formatData(res);
40+
await sendOutput(formattedRes, options, queryString);
41+
return formattedRes;
42+
} catch (err) {
43+
console.error(`Error in search: ${err}`);
44+
throw err;
45+
}
46+
} else {
47+
try {
48+
await sendOutput(res, options, queryString);
49+
return res;
50+
} catch (err) {
51+
console.error(`Error in search: ${err}`);
52+
throw err;
53+
}
3854
}
3955
}
4056

@@ -86,6 +102,24 @@ async function getInput (options) {
86102
}
87103
}
88104

105+
/**
 * Reshape each article in `dataArray` into the record layout used for output.
 *
 * Every output record takes `paperId` and `doi` from `article.doi`, `journal`
 * from `article.server`, copies `title`, `date` and `authors` unchanged, and
 * sets `brief` to `null`.
 *
 * The function is declared `async` although it awaits nothing, so callers
 * receive a Promise that resolves to the new array.
 *
 * @param {Array<Object>} dataArray Articles to convert; `.map` is called on it directly.
 * @returns {Promise<Array<Object>>} One reshaped record per input article, in the same order.
 * @throws Rethrows any error raised while mapping, after logging it to stderr.
 */
async function formatData (dataArray) {
106+
try {
107+
const formattedData = dataArray.map(article => ({
108+
// The DOI doubles as the record identifier.
paperId: article.doi,
109+
doi: article.doi,
110+
title: article.title,
111+
// The originating server name is reported as the journal.
journal: article.server,
112+
date: article.date,
113+
// No summary is produced here; the field is always null.
brief: null,
114+
authors: article.authors
115+
}));
116+
return formattedData;
117+
} catch (err) {
118+
// Log for visibility, then propagate so the caller can still react to the failure.
console.error(`Error in formatting data: ${err}`);
119+
throw err;
120+
}
121+
}
122+
89123
async function main () { // eslint-disable-line no-unused-vars
90124
(program
91125
.name('hyper-recent')

src/data-search.js

Lines changed: 46 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,48 @@
11
#! /usr/bin/env node
2-
3-
// here we will convert the search.sh script into JS
2+
import _ from 'lodash';
3+
import fs from 'fs';
44
import { format, sub } from 'date-fns';
5-
import fs from 'fs/promises';
6-
import { download, search, sendOutput } from './cli.js';
7-
8-
const CATEGORY_ID = 'alzheimers-disease';
9-
const DATA_DIRECTORY = 'example-data';
10-
11-
const MEDRXIV_SOURCE = 'medrxiv';
12-
const BIORXIV_SOURCE = 'biorxiv';
13-
14-
const now = new Date();
15-
const startOffset = { days: 1 };
16-
const START_DATE = format(sub(now, startOffset), 'yyyy-MM-dd');
17-
const END_DATE = format(now, 'yyyy-MM-dd');
18-
19-
const BIORXIV_FILE = `${DATA_DIRECTORY}/${END_DATE}_${BIORXIV_SOURCE}.json`;
20-
const MEDRXIV_FILE = `${DATA_DIRECTORY}/${END_DATE}_${MEDRXIV_SOURCE}.json`;
21-
const COMBINED_FILE = `${DATA_DIRECTORY}/${END_DATE}.json`;
22-
const OUTPUT_FILE = `${DATA_DIRECTORY}/${CATEGORY_ID}.json`;
23-
24-
// Getting all latest articles from BiorXiv
25-
console.log(`Fetching from ${BIORXIV_SOURCE} between ${START_DATE} and ${END_DATE}`);
26-
fs.open(BIORXIV_FILE, 'w');
27-
const bioOptions = {
28-
source: BIORXIV_SOURCE,
29-
output: BIORXIV_FILE
30-
};
31-
const bioData = await download(START_DATE, END_DATE, bioOptions);
32-
33-
// Getting all latest articles from MedrXiv
34-
console.log(`Fetching from ${MEDRXIV_SOURCE} between ${START_DATE} and ${END_DATE}`);
35-
fs.open(MEDRXIV_FILE, 'w');
36-
const medOptions = {
37-
source: MEDRXIV_SOURCE,
38-
output: MEDRXIV_FILE
39-
};
40-
const medData = await download(START_DATE, END_DATE, medOptions);
41-
42-
// Creating a JSON with all the results, both sources combined
43-
console.log('Combining results...');
44-
fs.open(COMBINED_FILE, 'w');
45-
const combinedData = bioData.concat(medData);
46-
const combinedOptions = {
47-
output: COMBINED_FILE
48-
};
49-
await sendOutput(combinedData, combinedOptions);
50-
51-
// Search for the QUERY keyword in all the downloaded articles & compile the related articles
52-
const QUERY = 'alzheimer';
53-
fs.open(OUTPUT_FILE, 'w');
54-
const outputOptions = {
55-
input: COMBINED_FILE,
56-
output: OUTPUT_FILE
57-
};
58-
console.log(`Searching for ${QUERY}`);
59-
const searchHits = await search(QUERY, outputOptions);
60-
const numSearchHits = searchHits.length;
61-
console.log(`Found ${numSearchHits} hits`);
5+
import { download } from './download.js';
6+
import { Search } from './search.js';
7+
import { writeFormattedJSON } from './cli.js';
8+
9+
/**
10+
* Download preprint data from the bioRxiv and medRxiv servers and run one search per configured topic.
* The date window is the month ending today; topics are read from example-data/data-config.json.
11+
* @returns {Promise<void>} Nothing is returned: the per-topic results are only written to example-data/data.json.
12+
*/
13+
export async function getData () {
14+
// Date window: from one month before now up to now, both formatted as yyyy-MM-dd
15+
const now = new Date();
16+
const startOffset = { months: 1 };
17+
const start = format(sub(now, startOffset), 'yyyy-MM-dd');
18+
const end = format(now, 'yyyy-MM-dd');
19+
20+
// Read the list of topics synchronously; the path is relative to the process working directory.
// The parsed value must be an array (it is mapped over below) whose entries carry a `keywords` field.
21+
const config = JSON.parse(fs.readFileSync('example-data/data-config.json'));
22+
23+
// Download from both servers concurrently; `data` holds one result per server
24+
const data = await Promise.all([
25+
download('biorxiv', start, end),
26+
download('medrxiv', start, end)
27+
]);
28+
29+
// Flatten one level so both servers' results form a single list
const articles = _.flatten(data);
30+
31+
// One Search instance is shared by every topic query
32+
const searcher = new Search();
33+
34+
// NOTE(review): presumably loads the articles into the searcher's index — confirm in ./search.js
await searcher.articles(articles);
35+
// Note: this `config` parameter is a single topic entry and shadows the outer `config` array
const doSearches = async config => {
36+
const { keywords } = config;
37+
const papers = await searcher.search(keywords, {
38+
combineWith: 'AND'
39+
});
40+
// Copy the topic entry into a fresh object and attach its hits as `papers`; the entry itself is not mutated
return _.assign({}, config, { papers });
41+
};
42+
// Run every topic search; results keep the order of the config entries
const collection = await Promise.all(config.map(doSearches));
43+
44+
// Write all topics with their matching papers to data.json
45+
await writeFormattedJSON(collection, 'example-data/data.json');
46+
}
47+
48+
getData();

0 commit comments

Comments
 (0)