-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpre_processing.py
More file actions
81 lines (63 loc) · 2 KB
/
pre_processing.py
File metadata and controls
81 lines (63 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import os
# Preprocessing: read and load FASTA files,
# and transform them into a format that PyTorch can consume.
def read_seq_graphprot(seq_file, label = 1):
    """
    Load the sequences of a FASTA file and pair each one with ``label``.

    Every non-header, non-blank line is treated as one complete sequence:
    it is upper-cased and every 'T' is replaced by 'U'. Header lines
    (starting with '>') are skipped.

    Args:
        seq_file: Path of the FASTA file to read.
        label: Label assigned to every sequence found in the file.

    Returns:
        Tuple ``(seq_list, labels)`` of two equally long lists.
    """
    seq_list = []
    labels = []
    with open(seq_file, 'r') as fp:
        for line in fp:
            # Strip only the line terminator; slicing off the last character
            # would truncate a final line that has no trailing newline.
            line = line.rstrip('\r\n')
            # Skip blank lines (they are not sequences) and '>' header lines.
            if not line or line[0] == '>':
                continue
            seq_list.append(line.upper().replace('T', 'U'))
            labels.append(label)
    return seq_list, labels
def read_data_file(posifile, negafile = None, train = True):
    """
    Build a sample dict from a positive file and an optional negative file.

    Sequences read from ``posifile`` are labelled 1. When ``negafile`` is
    given, its sequences are labelled 0 and appended after the positives.
    ``train`` is accepted but not used by this function.

    Returns:
        dict with "seq" (list of sequences) and "Y" (numpy array of labels
        in the same order as "seq").
    """
    sequences, targets = read_seq_graphprot(posifile, label = 1)
    if negafile:
        neg_sequences, neg_targets = read_seq_graphprot(negafile, label = 0)
        sequences = sequences + neg_sequences
        targets = targets + neg_targets
    return {"seq": sequences, "Y": np.array(targets)}
## NOTE: change `path` to the directory where your own data is stored.
def load_graphprot_data(protein, train = True, path = './data'):
    """
    Load the labelled sequences of one protein from a data directory.

    Every file in ``path`` whose name contains both ``protein`` and the
    split marker ('.train.' when ``train`` is true, '.ls.' otherwise) is
    read with ``read_seq_graphprot``. Files whose name contains 'positive'
    get label 1; all other matching files get label 0.

    Args:
        protein: Substring identifying the protein's files.
        train: Select the '.train.' files when true, the '.ls.' files
            otherwise.
        path: Directory that holds the FASTA files.

    Returns:
        dict with "seq" (list of sequences) and "Y" (numpy array of labels
        in the same order as "seq").
    """
    key = '.train.' if train else '.ls.'
    mix_seq = []
    mix_label = []
    # os.listdir() returns entries in arbitrary order; sort them so the
    # sample order is reproducible across runs and filesystems.
    for tmpfile in sorted(os.listdir(path)):
        if protein not in tmpfile or key not in tmpfile:
            continue
        label = 1 if 'positive' in tmpfile else 0
        seqs, labels = read_seq_graphprot(os.path.join(path, tmpfile), label = label)
        # Extend in place instead of rebuilding the accumulated lists per file.
        mix_seq.extend(seqs)
        mix_label.extend(labels)
    return {"seq": mix_seq, "Y": np.array(mix_label)}