UCAS_Deeplearning_Project/pre_processing.py at master · SZJShuffle/UCAS_Deeplearning_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import os
# Preprocessing: read and load FASTA files,
# and transform them into a format that PyTorch can consume.


def read_seq_graphprot(seq_file, label = 1):
    """
    Load sequences from a FASTA file and tag every sequence with ``label``.

    Lines starting with '>' are treated as headers and skipped. Every other
    non-blank line is treated as one complete sequence (multi-line records
    are NOT joined), upper-cased, and has 'T' replaced by 'U'.

    Args:
        seq_file: path of the FASTA file to read.
        label: value stored in the returned label list for each sequence.

    Returns:
        A tuple ``(seq_list, labels)`` of two lists with equal length.
    """
    seq_list = []
    with open(seq_file, 'r') as fp:
        for line in fp:
            # strip() instead of line[:-1]: the last line of a file may have
            # no trailing newline, and slicing would then drop a nucleotide.
            record = line.strip()
            # Blank lines carry no sequence; '>' lines are headers.
            if not record or record.startswith('>'):
                continue
            seq_list.append(record.upper().replace('T', 'U'))   ## let T to U

    labels = [label] * len(seq_list)
    return seq_list, labels


def read_data_file(posifile, negafile = None, train = True):
    """
    Read a positive file (and optionally a negative file) into one dict.

    Sequences from ``posifile`` are labelled 1; if ``negafile`` is given,
    its sequences are appended after them and labelled 0.

    Args:
        posifile: path of the positive-sample FASTA file.
        negafile: optional path of the negative-sample FASTA file.
        train: accepted but not used by this function.

    Returns:
        dict with "seq" (list of sequences) and "Y" (numpy array of labels).
    """
    all_seqs, all_labels = read_seq_graphprot(posifile, label = 1)

    if negafile:
        neg_seqs, neg_labels = read_seq_graphprot(negafile, label = 0)
        all_seqs = [*all_seqs, *neg_seqs]
        all_labels = [*all_labels, *neg_labels]

    return {"seq": all_seqs, "Y": np.array(all_labels)}


## NOTE: change `path` to the directory where your own data is stored.
def load_graphprot_data(protein, train = True, path = './data'):
    """
    Load all positive and negative samples of one protein into a dict.

    Every file in ``path`` whose name contains both ``protein`` and the
    split key ('.train.' when ``train`` is true, '.ls.' otherwise) is read.
    Files whose name contains 'positive' are labelled 1; all other matching
    files are labelled 0.

    Args:
        protein: substring identifying the protein in the file names.
        train: select the '.train.' files if true, the '.ls.' files if not.
        path: directory that holds the data files.

    Returns:
        dict with "seq" (list of sequences) and "Y" (numpy array of labels),
        ordered by sorted file name.
    """
    key = '.train.' if train else '.ls.'
    mix_label = []
    mix_seq = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # sample order reproducible across runs, machines and filesystems.
    for tmpfile in sorted(os.listdir(path)):
        if protein not in tmpfile or key not in tmpfile:
            continue
        label = 1 if 'positive' in tmpfile else 0
        seqs, labels = read_seq_graphprot(os.path.join(path, tmpfile), label = label)
        # extend() appends in place instead of re-copying the accumulators.
        mix_label.extend(labels)
        mix_seq.extend(seqs)

    return {"seq": mix_seq, "Y": np.array(mix_label)}