-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpopulate_workflow_status.pl
More file actions
941 lines (549 loc) · 35.2 KB
/
populate_workflow_status.pl
File metadata and controls
941 lines (549 loc) · 35.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
#!/usr/bin/env perl
use strict;
use warnings;
use DBI;
use JSON::PP;
use File::Basename;
use File::Spec;
# add lib sub-folder containing modules into @INC
use lib File::Spec->catdir(File::Basename::dirname(File::Spec->rel2abs($0)), 'lib');
# add modules from lib-subfolder
use AuditTable;
use ABCInfo;
use Util;
use Mappings;
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
use Encode;
binmode(STDOUT, ":utf8");
=head1 NAME populate_workflow_status.pl
=head1 SYNOPSIS
Used to load FlyBase curation status information into the Alliance ABC literature database for those types of curation that are stored in the ABC in the 'workflow_tag' table, because they are curation types which are relevant to the publication workflow (how a publication 'moves' through the various automated/manual curation processes at a MOD). Generates an output file containing a single json structure for all the data (the workflow_tag status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source and the API to use to submit the data into the ABC).
=cut
=head1 USAGE
USAGE: perl populate_workflow_status.pl pg_server db_name pg_username pg_password dev|test|production access_token
=cut
=head1 DESCRIPTION
Types of curation that are relevant to the publication workflow (and thus stored in the Alliance in 'workflow_tag') include those that are:
o important for deciding the next step in the publication workflow process (manual or automated)
o e.g. if a paper has been community curated via FTYP, there is no need for a FB curator to do first-pass ('skim') curation.
o a type of curation that must have been completed before another type of curation can occur
o e.g. a paper must have been 'thin' curated (Alliance name: 'manual_indexing') before curation of phenotypic or DO data can occur.
Three types of FB curation are mapped to the appropriate Alliance information by this script:
o community curation
o first pass curation by a biocurator ('skim' curation at FB)
o manual indexing ('thin' curation at FB)
(NB: In the Alliance, curation status for individual datatypes (Alliance name: 'topics') are stored in 'curation_status', NOT 'workflow_tag', so FB curation status info for these types of curation (e.g. phenotype, physical interactions) are not dealt with by this script, but instead by populate_topic_curation_status.pl).
Script logic:
o Uses FB 'curated_by' pubprop information to determine the curation status of the three FB curation types being mapped to workflow_tag. Sets curation status to 'done' when a file of the standard expected filename format is found for a given curation type.
o Uses the 'nocur' flag to identify papers that contain 'no genetic information'. Validates that the nocur flag is correct and then sets manual_indexing status to 'won't curate' (with 'no genetic information' curation_tag), overriding any 'done' status added in the first step above.
o Identifies papers that have not yet been manually indexed, but which contain high-priority data. Sets curation status to 'curation needed' for manual indexing, with a note explaining why the paper is high priority.
o Adds publication-level internal notes to the 'note' of the appropriate workflow_tag curation type
o first filters out internal notes that are either not being submitted to the Alliance or will be submitted in a different script (e.g. attached either to a topic or a topic curation status).
o uses the internal note timestamp to identify which of the three workflow_tag curation types to add the internal note to.
o For any internal notes where the timestamp did not match any of the workflow_tag timestamps for that publication (can happen if the note was added as an edit record), add it to the manual indexing status (if that exists), or then the first-pass curation status (if that exists).
o Any internal notes that have not been matched up and added in the above steps are printed in the FB_workflow_status_data_errors.err file.
Script has three modes:
o dev mode
o single FBrf mode: asks user for FBrf number (can also use a regular expression to test multiple FBrfs in this mode).
o Data is printed to both the json output and 'plain' output files.
o makes error files (see below) to record any errors.
o test mode
o single FBrf mode: asks user for FBrf number (must be a single FBrf number).
o uses curl to try to POST data to the Alliance ABC stage server (so asks user for okta token for Alliance ABC stage server).
o Data (including a record of successful curl POST events) is printed to the 'plain' output file.
o makes error files (see below) to record any errors.
o production mode
o makes data for all relevant FBrfs in chado.
o Data is printed to the json output file.
o makes error files (see below) to record any errors.
o Output files
o json output file (FB_workflow_status_data.json) containing a single json structure for all the data (manual_indexing status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source). Data is printed to this file in all modes except 'test'.
o 'plain' output file (FB_workflow_status_data.txt) to aid in debugging - prints the same data as in the json file, but with a single 'DATA:' tsv row for each FBrf+topic combination. Data is printed to this file in all modes.
o FB_workflow_status_data_errors.err - errors in mapping FlyBase data to appropriate Alliance json are printed in this file. Data is printed to this file in all modes.
o FB_workflow_status_process_errors.err - processing errors - if a curl POST fails in test mode, the failed json element and the reason for the failure are printed in this file. Expected to be empty for all other modes.
=cut
# --- command-line argument handling ---
# Six positional arguments are required: chado connection details, the run
# mode, and an Alliance access token (the token is used in test mode).
if (@ARGV != 6) {
    warn "Wrong number of arguments, should be 6!\n";
    warn "\n USAGE: $0 pg_server db_name pg_username pg_password dev|test|production access_token\n\n";
    warn "\teg: $0 flysql24 production_chado zhou pwd dev|test|production ABCD1234\n\n";
    exit 1;    # BUGFIX: was bare 'exit' (status 0); a usage error must exit non-zero
}
my $server = shift(@ARGV);
my $db = shift(@ARGV);
my $user = shift(@ARGV);
my $pwd = shift(@ARGV);
my $ENV_STATE = shift(@ARGV);
my $access_token = shift(@ARGV);
unless ($ENV_STATE eq 'dev' || $ENV_STATE eq 'test' || $ENV_STATE eq 'production') {
    warn "Unknown state '$ENV_STATE': must be 'dev', 'test' or 'production'\n\n";
    exit 1;    # BUGFIX: likewise, an invalid mode must exit non-zero
}
# --- mode-specific set-up ---
# $api_endpoint names the path (downstream of the base URL) of the Alliance
# Literature Service API; it appears in the output json metaData and in the
# curl command used in test mode.
my $api_endpoint = 'workflow_tag';
# $test_FBrf becomes the pattern used to select publications from chado:
# a single FBrf (test), an FBrf or regex (dev), or all FBrfs (production).
my $test_FBrf = '';
if ($ENV_STATE eq 'test') {
    # test mode writes to the live stage server - require explicit confirmation
    print STDERR "You are about to write data to the stage Alliance literature server\n";
    print STDERR "Type y to continue else anything else to stop:\n";
    chomp(my $continue = <STDIN>);
    die "Processing has been cancelled.\n" unless lc($continue) eq 'y';
    print STDERR "Processing will continue.\n";
}
if ($ENV_STATE eq 'production') {
    # production: process every FBrf in chado
    $test_FBrf = '^FBrf[0-9]+$';
} else {
    # dev/test: ask the user which FBrf to process (dev also allows a regex)
    print STDERR "FBrf to test:";
    chomp($test_FBrf = <STDIN>);
    if ($ENV_STATE eq 'test' && $test_FBrf !~ m/^FBrf[0-9]{7}$/) {
        die "Only a single FBrf is allowed in test mode.\n";
    }
}
# connect to the chado database and set up the json encoder used for output
my $dsource = "dbi:Pg:dbname=$db;host=$server;port=5432";
my $dbh = DBI->connect($dsource, $user, $pwd) or die "cannot connect to $dsource\n";
my $json_encoder = JSON::PP->new()->pretty(1)->canonical(1);
# Maps each FB curation type (keys prefixed 0_/1_/2_ so a sorted iteration
# visits them in this fixed order) to the Alliance data used to report it:
#   finished_status        - ATP term meaning this curation type is 'done'
#   relevant_record_type   - curation-record filename types indicating the
#                            curation happened (array, checked in order)
#   nocur_override         - ATP term used instead when the pub has a
#                            validated 'nocur' flag ("won't curate")
#   pubtype_filter         - restricts a record type to given publication types
#   high_priority_override - ATP term used when the pub is uncurated but
#                            carries a high-priority flag ('curation needed')
#   second_pass            - marks types eligible to receive otherwise
#                            unmatched internal notes in the second pass below
my $workflow_tag_mapping = {
    # community curation
    '0_user' => {
        'finished_status' => 'ATP:0000234', # community curation finished
        'relevant_record_type' => ['user'],
    },
    # first pass curation ('skim' curation at FB)
    '1_skim' => {
        'finished_status' => 'ATP:0000330', # first pass curation finished
        'relevant_record_type' => ['skim'],
        'second_pass' => '1',
    },
    # manual indexing ('thin' curation at FB)
    '2_manual_indexing' => {
        'finished_status' => 'ATP:0000275', # manual indexing complete
        'relevant_record_type' => ['thin', 'cam_full', 'gene_full', 'cam_no_suffix'], # use an array so can go through the types in this order when assigning manual indexing status
        'nocur_override' => 'ATP:0000343', # won't manually index
        'pubtype_filter' => {
            # a 'cam_no_suffix' record only counts as manual indexing for reviews
            'cam_no_suffix' => {
                'review' => '1',
            },
        },
        'high_priority_override' => 'ATP:0000274', # manual indexing needed
        'second_pass' => '1',
    },
};
# structure of the required json for each workflow_tag element
#{
#    "date_created": $timestamp,
#    "date_updated": $timestamp,
#    "created_by": $curator,
#    "updated_by": $curator,
#    "mod_abbreviation": "FB",
#    "reference_curie": FB:$FBrf,
#    "workflow_tag_id": $ATP, # the ATP term that describes the status of the particular type of curation
#    "curation_tag": # an ATP ID for the 'controlled_note' - most are negative
#    "note": # a free text note
#}
# open output and error logging files
open my $json_output_file, '>', "FB_workflow_status_data.json"
or die "Can't open json output file ($!)\n";
binmode($json_output_file, ":utf8");
open my $data_error_file, '>', "FB_workflow_status_data_errors.err"
or die "Can't open data error logging file ($!)\n";
binmode($data_error_file, ":utf8");
open my $process_error_file, '>', "FB_workflow_status_process_errors.err"
or die "Can't open processing error logging file ($!)\n";
binmode($process_error_file, ":utf8");
open my $plain_output_file, '>', "FB_workflow_status_data.txt"
or die "Can't open plain output file ($!)\n";
binmode($plain_output_file, ":utf8");
print $plain_output_file "##Starting processing: " . (scalar localtime) . "\n";
# All curation-record data for the database, used throughout to look up the
# curator/record responsible for a given pub+timestamp.
my $all_curation_record_data = &get_all_currec_data($dbh);
## get relevant data for deciding curation status for the various types of workflow_tag curation
my $fb_data = {};
foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) {
    foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) {
        ($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}) = &get_relevant_currec_for_datatype($dbh,$relevant_record_type);
    }
}
#print Dumper ($fb_data);
## get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term
# BUGFIX: the pattern was '^nocur|nocur_abs$', which (in both Perl and
# PostgreSQL POSIX regexes) parses as '(^nocur)|(nocur_abs$)' because
# alternation binds more loosely than the anchors, so it matched any flag
# value merely *starting* with 'nocur'.  Group the alternation so that only
# the exact values 'nocur' and 'nocur_abs' match.
my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^(nocur|nocur_abs)$');
## get publications that *do* have links to genetic objects (used for validation)
my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data');
## get diseaseHP flags so can assign manual indexing 'needs curation' where appropriate
my $high_priority_flags = {};
$high_priority_flags->{'diseaseHP_dis_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$');
$high_priority_flags->{'diseaseHP_harv_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^diseaseHP$');
$high_priority_flags->{'wt_exp_needs_curation'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^wt_exp::Needs cam curation$');
# maps each high-priority flag type to the free-text note recorded for it
my $high_priority_mapping = {
    'diseaseHP_dis_flag' => 'diseaseHP',
    'diseaseHP_harv_flag' => 'diseaseHP',
    'wt_exp_needs_curation' => 'wt_exp::Needs cam curation',
};
#print Dumper ($high_priority_flags);
# Regexes for internal-note lines this script must NOT submit: each pattern
# matches a whole note line that is either handled by a different export
# script or is not submitted to the Alliance at all.
my $additional_filters = [
    # Use simple regex to remove *all* 'Dataset:' lines - will be converted to a topic and/or associated free text note in another script
    # The commented out lines are the regexes needed to identify the 'Dataset: pheno' lines for the topic scripts
    #'^Dataset: pheno\.?$',
    #'^Dataset: pheno\. ?-?[a-z]{1,} ?[0-9]{6}(\.)?$',
    #'^Dataset: pheno\. [0-9]{6}[a-z]{1,}\.$',
    '^(D|d)ataset:.+$',
    # filters to remove lines that will be converted to a topic *status* and/or associated free text note in another script
    '^HDM flag not applicable\.?( *[a-z]{1,}[0-9]{6})?$',
    '^phen curation: only pheno_chem data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$',
    '^phen curation: No phenotypic data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$',
    '^phen_cur: CV annotations only(\. *[a-z]{2}[0-9]{6}\.?)?$',
    '^phys_int not curated.+$',
    '^FTA: DOcur genotype - need to check for missing drivers$',
    # filters to remove lines that will be converted to a free text note attached to a topic in another script
    '^The phys_int flag inferred from.+$',
    '^The phys_int flag is inferred from.+$',
    '^The phys_int flag was inferred.+$',
    '^FTYP cell line:.+$',
    # filter to remove lines that would be better converted into a note when submit curation record filename info.
    '^Curation record .*? is to add the allele phendesc data originally curated in .+$',
    '^Curation record .*? is to fix data for MI4 .+$',
    '^Curation record .*? generated by hand to fix MI4 ticket.+$',
    # filter to remove preliminary data that will not be submitted to the Alliance
    '^HDM flag future.+$',
];
## get any relevant internal notes to be added to the free text note slot of a workflow_tag element - first filtering out any internal notes that do not need to be added by this script (they will instead either be converted into an ATP term corresponding to a topic or controlled note in the Alliance, or added as a free text note to a specific topic), using the regexes specified in $additional_filters.
my $all_candidate_internal_notes = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters);
## then remove any internal notes that come from curation records for a particular topic - these will be added as a note to the curation status of the *topic* in another script, rather than being added to the more general workflow_tag categories in this script.
# any remaining internal notes will be those either submitted under one of the workflow_tag types for this script (identified later through matching timestamp and curation record filename information) or under an 'edit' record - the latter will be added to the most appropriate workflow_tag with a note indicating it was an edit record
my @topic_record_types = ('cell_line', 'phys_int', 'DO', 'neur_exp', 'wt_exp', 'chemical', 'args', 'phen', 'humanhealth');
foreach my $topic_record_type (@topic_record_types) {
    # curation records (by curator+timestamp) for this topic type only
    my (undef, $curation_record_data) = &get_relevant_currec_for_datatype($dbh,$topic_record_type);
    foreach my $pub_id (keys %{$all_candidate_internal_notes}) {
        foreach my $int_note (keys %{$all_candidate_internal_notes->{$pub_id}}) {
            foreach my $timestamp (@{$all_candidate_internal_notes->{$pub_id}->{$int_note}}) {
                # identify the curator/record(s) responsible for this note's timestamp
                my $int_note_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp);
                if (defined $int_note_details && $int_note_details->{currecs} ne 'multiple curators for same timestamp') {
                    # the note originates from a topic-specific curation record: drop it here,
                    # it will be exported by the topic curation status script instead
                    if (exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"} && exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"}->{$timestamp}->{"$int_note_details->{currecs}"}) {
                        delete $all_candidate_internal_notes->{$pub_id}->{$int_note};
                        # NOTE(review): 'next' only skips to the next timestamp of the
                        # now-deleted note; 'last' would express the intent more directly,
                        # though behaviour is the same since the note is already deleted
                        next;
                    }
                }
            }
        }
    }
}
## Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl)
my $pub_id_to_FBrf = {};
# Bind the FBrf pattern via a DBI placeholder rather than sprintf-ing it into
# the SQL string: $test_FBrf is typed by the user in dev/test mode, so direct
# interpolation risked SQL injection / broken quoting.
my $sql_query = "select p.uniquename, p.pub_id, cvt.name from pub p, cvterm cvt where p.is_obsolete = 'f' and p.type_id = cvt.cvterm_id and cvt.is_obsolete = '0' and cvt.name in ('paper', 'erratum', 'letter', 'note', 'teaching note', 'supplementary material', 'retraction', 'personal communication to FlyBase', 'review') and p.uniquename ~ ?";
my $db_query = $dbh->prepare($sql_query);
$db_query->execute($test_FBrf) or die" CAN'T GET FBrf FROM CHADO:\n$sql_query)\n";
while (my ($uniquename, $pub_id, $pub_type) = $db_query->fetchrow_array()) {
    $pub_id_to_FBrf->{$pub_id}->{'FBrf'} = $uniquename;
    $pub_id_to_FBrf->{$pub_id}->{'type'} = $pub_type;
}
## Build the workflow_tag data for every selected publication.  For each
## pub x workflow type, the first matching curation-record type (in the order
## given by 'relevant_record_type') supplies the 'finished' status; a
## validated nocur flag can then replace it with "won't curate", and
## high-priority flags can add "curation needed" for uncurated papers.
my $workflow_status_data = {};
foreach my $pub_id (sort keys %{$pub_id_to_FBrf}) {
    my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'};
    my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'};
    # nocur status is determined once per publication and reused for every workflow type
    my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id);
    foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) {
        foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) {
            # skip record types restricted to particular publication types (e.g. cam_no_suffix only counts for reviews)
            if (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'} && exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}) {
                unless (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}->{$pub_type}) {
                    next;
                }
            }
            # only the first record type that produces a match is used for each workflow type
            unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) {
                my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id);
                # if there is a matching record for the workflow type, store the information for submitting data to the Alliance
                if (defined $curator_details) {
                    my $ATP = $workflow_tag_mapping->{$workflow_type}->{'finished_status'};
                    my $curation_tag = '';
                    my $note = '';
                    # set values based on matching record (overriden in a few edge cases below)
                    my $curator = "$curator_details->{curator}";
                    my $timestamp = "$curator_details->{timestamp}";
                    my $curation_records = "$curator_details->{currecs}";
                    my $debugging_note = '';
                    # for manual indexing, use any nocur information to override the workflow type (to the 'won't curate' style term)
                    # and add the appropriate 'no genetic data' curation_tag where appropriate
                    if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) {
                        if ($nocur_status == 1) {
                            $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'};
                            $curation_tag = "ATP:0000207"; # no genetic data
                            $note = "$nocur_note";
                            # if the nocur flag came from a different event than the curation
                            # record, prefer the nocur flag's timestamp/curator in the output
                            unless ($timestamp eq $nocur_timestamp) {
                                $timestamp = "$nocur_timestamp";
                                $debugging_note = "timestamp mismatch: curation record info overwritten by nocur flag info";
                                my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp);
                                if (defined $nocur_details) {
                                    $curator = "$nocur_details->{curator}";
                                    $curation_records = "$nocur_details->{currecs}";
                                } else {
                                    # fall back to a generic curator when no record matches the nocur timestamp
                                    $curator = 'FB_curator';
                                    $curation_records = "WARNING: unable to get curator details for nocur flag";
                                }
                            }
                        }
                    }
                    # build reference with information for this publication+workflow type combination
                    my $FBrf_with_prefix="FB:".$FBrf;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB";
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP;
                    # curation_tag and note are optional json slots - only add when set
                    if ($curation_tag) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag;
                    }
                    if ($note) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note;
                    }
                    # debugging slots are for the plain output file only, not the json submission
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = $relevant_record_type;
                    if ($debugging_note) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note;
                    }
                }
            }
        }
        # once been through all the curation record types for each workflow_type, see if there is additional information that can be added via nocur flag
        if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) {
            unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) {
                if ($nocur_status == 1) {
                    my $curator = 'FB_curator'; # default that is overriden with more specific data later where possible
                    my $timestamp = "$nocur_timestamp";
                    my $curation_records = 'WARNING: unable to get curator details for nocur flag'; # default that is overriden with more specific data later where possible
                    my $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'};
                    my $curation_tag = "ATP:0000207"; # no genetic data
                    my $note = "$nocur_note";
                    my $debugging_note = '';
                    # get curator details for the nocur flag and use to override defaults where possible
                    my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id,$timestamp);
                    if (defined $nocur_details) {
                        $curator = "$nocur_details->{curator}";
                        $curation_records = "$nocur_details->{currecs}";
                    }
                    # override edge cases where a curator added nocur in a user record
                    if ($curator eq 'Author Submission' || $curator eq 'User Submission') {
                        $curator = 'FB_curator';
                    }
                    # build reference with information for this publication+workflow type combination
                    my $FBrf_with_prefix="FB:".$FBrf;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB";
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP;
                    # optional slots, as above
                    if ($curation_tag) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag;
                    }
                    if ($note) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note;
                    }
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records;
                    $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'via flag';
                    if ($debugging_note) {
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note;
                    }
                }
            }
        }
        # next, see if there are any non-curated papers that should be marked as 'need curation' as they are high-priority
        if (exists $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}) {
            foreach my $flag_type (sort keys %{$high_priority_flags}) {
                unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) {
                    if (exists $high_priority_flags->{$flag_type}->{$pub_id}) {
                        my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'};
                        my $curation_tag = "ATP:0000353"; # high priority data
                        my $note = "$high_priority_mapping->{$flag_type}";
                        my $debugging_note = '';
                        # NOTE(review): assumes the flag's timestamp list is keyed by the mapped
                        # note text and takes the first entry - confirm against the sub's output
                        my $timestamp = $high_priority_flags->{$flag_type}->{$pub_id}->{"$high_priority_mapping->{$flag_type}"}[0];
                        # set generic defaults that are overwritten later with more specific information
                        my $curator = 'FB_curator';
                        my $curation_records = '';
                        # get curator details for the high priority flag and use to override generic defaults where possible
                        my $high_priority_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp);
                        if (defined $high_priority_details) {
                            $curator = "$high_priority_details->{curator}";
                            $curation_records = "$high_priority_details->{currecs}";
                        }
                        # build reference with information for this publication+workflow type combination
                        my $FBrf_with_prefix="FB:".$FBrf;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB";
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix;
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP;
                        if ($curation_tag) {
                            $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag;
                        }
                        # a second high-priority flag type appends to any note set by an earlier one
                        if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) {
                            my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}";
                            $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$existing_note, $note";
                        } else {
                            $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$note";
                        }
                        $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records;
                        if ($debugging_note) {
                            $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note;
                        }
                    }
                }
            }
        }
    }
}
# try to add relevant publication-level internal notes where appropriate
# Matching strategy: an internal note is attached to a workflow_tag element by
# comparing the note's timestamp (and the curation records recorded for that
# pub+timestamp) against the values stored above; anything still unmatched
# after both passes is reported in the data-errors file.
foreach my $pub_id (sort keys %{$all_candidate_internal_notes}) {
    if (exists $workflow_status_data->{$pub_id}) {
        foreach my $int_note (sort keys %{$all_candidate_internal_notes->{$pub_id}}) {
            my $switch = 0; # set non-zero once the note has been attached somewhere
            my $reformatted_note = &clean_note("$int_note");
            $reformatted_note =~ s/\n/ /g; # single-line form used only for error reporting
            # for internal notes with a single timestamp
            if (scalar @{$all_candidate_internal_notes->{$pub_id}->{$int_note}} == 1) {
                my $int_note_timestamp = join '', @{$all_candidate_internal_notes->{$pub_id}->{$int_note}};
                # A. first pass - go through different workflow types, from least to most detailed curation, and match up internal notes using timestamp and curation record info
                # 1. go through the different workflow_tags to find cases where both the timestamp and curation records show a match
                foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) {
                    unless ($switch) {
                        my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}";
                        my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}";
                        my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}";
                        my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp);
                        # 2. if the timestamps of the workflow tag and internal note match
                        if ($int_note_timestamp eq $workflow_type_timestamp) {
                            # 3. get the curation record details for the internal note to check against those of workflow_tag
                            if (defined $int_note_curator_details) {
                                my $int_note_currecs = "$int_note_curator_details->{currecs}";
                                my $int_note_curator = "$int_note_curator_details->{curator}";
                                # 4. if the curation record for the workflow type is in the list of possibilities for the internal note
                                # and there is only one curator possibility for that timestamp
                                # then the internal note can be added to the entry for that workflow tag
                                # NOTE(review): $workflow_type_currecs is interpolated into the match
                                # unescaped, so regex metacharacters in a filename (e.g. '.') match
                                # loosely - consider \Q...\E if an exact substring match is intended
                                if ($int_note_currecs =~ m/$workflow_type_currecs/ && $int_note_currecs ne 'multiple curators for same timestamp') {
                                    $switch++;
                                    if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) {
                                        my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}";
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note");
                                    } else {
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note");
                                    }
                                }
                            }
                        } else {
                            # if the timestamps do not match, but the curation record filename matches the type, it's still OK to add the note
                            foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) {
                                unless ($switch) {
                                    if (defined $int_note_curator_details) {
                                        if ($int_note_curator_details->{currecs} =~ m/$relevant_record_type/) {
                                            $switch++;
                                            if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) {
                                                my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}";
                                                $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$existing_note||$int_note");
                                            } else {
                                                $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note");
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                # B. second pass, if the internal note did not match in the loop above, add it to the note for the workflow_type if appropriate, so that the information gets into the Alliance
                # (Many of the non-matching notes are edits to original curation, so makes sense to add to either skim/manual_indexing workflow types).
                # (reverse sort means manual indexing is tried before first-pass curation)
                foreach my $workflow_type (reverse sort keys %{$workflow_tag_mapping}) {
                    if (exists $workflow_tag_mapping->{$workflow_type}->{'second_pass'}) {
                        unless ($switch) {
                            if (exists $workflow_status_data->{$pub_id}->{$workflow_type}) {
                                my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}";
                                my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}";
                                my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}";
                                my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp);
                                if (defined $int_note_curator_details) {
                                    my $int_note_currecs = "$int_note_curator_details->{currecs}";
                                    my $int_note_curator = "$int_note_curator_details->{curator}";
                                    # record provenance of the second-pass addition in the debugging note
                                    my $debugging_note = "ADDED in second pass: $int_note ($int_note_curator, $int_note_currecs)";
                                    $debugging_note =~ s/\n/ /g;
                                    $switch++;
                                    if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) {
                                        my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}";
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note");
                                    } else {
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note");
                                    }
                                    if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}) {
                                        my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}";
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$existing_note||$debugging_note");
                                    } else {
                                        $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$debugging_note");
                                    }
                                }
                            }
                        }
                    }
                }
            } else {
                # notes carrying multiple timestamps cannot be matched to a single workflow_tag element
                print $data_error_file "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n";
            }
            unless ($switch) {
                print $data_error_file "WARNING: internal note(s) that could not match up: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n";
            }
        }
    }
}
#print Dumper ($workflow_status_data);
my $complete_data = {};
foreach my $pub_id (sort keys %{$workflow_status_data}) {
foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) {
my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'};
my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'};
#store data for making json later
push @{$complete_data->{data}}, $workflow_status_data->{$pub_id}->{$workflow_type}->{json};
# simple output for testing/debugging
my $curated_by = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'}" : '';
my $updated_by = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'}" : '';
my $workflow_tag_id = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} : '';
my $date_created = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} : '';
my $curation_tag = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} : '';
my $note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} : '';
$note =~ s/\n/ /g;
my $curation_records = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} : '';
my $debugging_note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} : '';
my $relevant_record_type = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} : '';
print $plain_output_file "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curated_by\t$curation_records\t$workflow_tag_id\t$curation_tag\t$date_created\t$note\t$debugging_note\n";
}
}
#print Dumper ($complete_data);
# convert any curator names as needed
$complete_data->{data} = &convert_curator_names_bulk($complete_data->{data});
unless ($ENV_STATE eq "test") {
    # Production run: write the whole payload (plus metadata) as one json file.
    my $json_metadata = &make_abc_json_metadata($db, $api_endpoint);
    $complete_data->{"metaData"} = $json_metadata;
    my $complete_json_data = $json_encoder->encode($complete_data);
    print $json_output_file $complete_json_data;
} else {
    # Test run: POST each record individually to the stage literature API.
    foreach my $element (@{$complete_data->{"data"}}) {
        my $json_element = $json_encoder->encode($element);
        # Escape embedded single quotes for the single-quoted -d '...' shell
        # argument below (POSIX idiom: close quote, escaped quote, reopen).
        # Without this, any note containing a ' breaks the curl command and
        # allows shell injection via note content.
        (my $shell_safe_json = $json_element) =~ s/'/'\\''/g;
        my $cmd="curl -X 'POST' 'https://stage-literature-rest.alliancegenome.org/$api_endpoint/' -H 'accept: application/json' -H 'Authorization: Bearer $access_token' -H 'Content-Type: application/json' -d '$shell_safe_json'";
        my $raw_result = `$cmd`;
        # A response consisting solely of digits is treated as success
        # (presumably the new record's id — confirm against the API).
        if ($raw_result =~ m/^\d+$/) {
            print $plain_output_file "json post success\nJSON:\n$json_element\n\n";
        } else {
            print $process_error_file "json post failed\nJSON:\n$json_element\nREASON:\n$raw_result\n#################################\n\n";
        }
    }
}
print $plain_output_file "##Ended processing: " . (scalar localtime) . "\n";
# Check close on the write handles: buffered write errors (e.g. disk full)
# only surface at close, so an unchecked close can silently lose output.
close $json_output_file or warn "Failed to close json output file: $!";
close $data_error_file or warn "Failed to close data error file: $!";
close $process_error_file or warn "Failed to close process error file: $!";
close $plain_output_file or warn "Failed to close plain output file: $!";