import pickle

import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold


def load_network(file_path):
    """
    Load a network from file.

    :param file_path: Full pathname of the network file
    :return: net (pandas.DataFrame): the edges in the network;
             nodes (pandas.DataFrame): the unique nodes in the network
    """
    net = pd.read_table(filepath_or_buffer=file_path, header=None,
                        index_col=None, names=['source', 'target'], sep='\t')
    nodes = pd.concat([net['source'], net['target']], ignore_index=True)
    nodes = pd.DataFrame(nodes, columns=['nodes']).drop_duplicates()
    nodes.reset_index(drop=True, inplace=True)
    return net, nodes
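
# Illustrative note: per the read_table call above, the network file is a headerless,
# tab-separated edge list with one gene pair per line. The gene names below are
# placeholders, not from the repository's data:
#   GENE_A<TAB>GENE_B
#   GENE_A<TAB>GENE_C
# A quick sanity check might look like:
#   net, nodes = load_network('./data/PathNet/PathNet.txt')
#   print(net.shape, nodes.shape)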


def build_customized_feature_matrix(feat_file_lst, network_file, feat_name_lst):
    """
    Build a feature matrix on your own data.

    :param feat_file_lst: List of full pathnames of feature files. Each file in feat_file_lst
                          contains two columns, i.e., gene names and feature values.
    :param network_file: Full pathname of the network file
    :param feat_name_lst: List of feature names
    :return: Concatenated feature matrix with n rows (genes) and m columns (features) (pandas.DataFrame)
    """
    feat_dic = dict()
    # Load gene features from each feature file
    for i in range(len(feat_file_lst)):
        feat_dic[feat_name_lst[i]] = pd.read_csv(feat_file_lst[i], sep='\t', index_col=0)
    # Load the network from file
    net, net_nodes = load_network(network_file)
    # Normalize the absolute feature values to [0, 1] with MinMaxScaler;
    # genes present in the network but missing from a feature file get the value 0
    scaler = preprocessing.MinMaxScaler()
    feat_raw = scaler.fit_transform(
        np.abs(feat_dic[feat_name_lst[0]].reindex(net_nodes['nodes'].values.tolist(), fill_value=0)))
    # Concatenate multiple features to form one feature matrix
    if len(feat_file_lst) > 1:
        for i in range(1, len(feat_file_lst)):
            feat_raw = np.concatenate(
                (feat_raw,
                 scaler.fit_transform(
                     np.abs(feat_dic[feat_name_lst[i]].reindex(net_nodes['nodes'].values.tolist(), fill_value=0)))),
                axis=1)
    return pd.DataFrame(feat_raw, index=net_nodes['nodes'].values.tolist(), columns=feat_name_lst)
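
# Illustrative usage sketch. The './my_data/...' paths and the two feature names are
# hypothetical placeholders; only the PathNet path is taken from this script:
#   feats = build_customized_feature_matrix(
#       ['./my_data/mutation.tsv', './my_data/expression.tsv'],
#       './data/PathNet/PathNet.txt',
#       ['mut', 'exp'])
#   # feats is a genes x features DataFrame with values scaled to [0, 1]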


def create_edge_index(network_file, net_features):
    """
    Convert the edges in a network into edges indexed by integer ids, which is necessary to
    build an object of type torch_geometric.data.Data.

    :param network_file: Full pathname of the network file
    :param net_features (pandas.DataFrame): Concatenated feature matrix with n rows (genes) and m columns (features)
    :return (pandas.DataFrame): Edges indexed by integer ids
    """
    net, _ = load_network(network_file)
    # Map every gene name to an integer id given by its row position in the feature matrix
    node_df = pd.DataFrame({'name': net_features.index.values.tolist(),
                            'id': np.arange(net_features.shape[0])})
    net = pd.merge(left=net, right=node_df, how='left', left_on='source', right_on='name')
    net.columns = ['source', 'target', 'sourcename', 'sourceid']
    net = pd.merge(left=net, right=node_df, how='left', left_on='target', right_on='name')
    net.columns = ['source', 'target', 'sourcename', 'sourceid', 'targetname', 'targetid']
    edge_index1 = net.loc[:, ['sourceid', 'targetid']]
    # Treat the graph as an undirected graph by adding every edge in both directions;
    # rename the columns so that concat stacks the reversed edges instead of
    # re-aligning them by column name (which would just duplicate the original edges)
    edge_index2 = net.loc[:, ['targetid', 'sourceid']]
    edge_index2.columns = ['sourceid', 'targetid']
    edge_index = pd.concat([edge_index1, edge_index2], axis=0, ignore_index=True)
    return edge_index
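
# Note: the returned DataFrame has shape (2 * num_edges, 2) with one edge per row; the script
# below transposes it into the 2 x num_edges layout that torch_geometric expects for edge_index.
# A minimal illustrative check (assumes net_features was built from the same network file):
#   ei = create_edge_index('./data/PathNet/PathNet.txt', net_features)
#   assert ei.shape[1] == 2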


def generate_5CV_set(drivers, nondrivers, randseed):
    """
    Generate 5CV (five-fold cross-validation) splits.

    :param drivers: List of canonical driver genes (positive samples)
    :param nondrivers: List of nondriver genes (negative samples)
    :param randseed: Random seed
    :return: 5CV splits stored in a dictionary
    """
    # Stratified five-fold split over positive and negative samples
    X, y = drivers + nondrivers, np.hstack(([1] * len(drivers), [0] * len(nondrivers)))
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=randseed)
    X_5CV = {}
    cv_idx = 1
    for train, test in skf.split(X, y):
        # train/test hold sample indices into the X list;
        # for each split, convert the indices into gene names and labels
        train_set = [X[i] for i in train]
        train_label = [y[i] for i in train]
        test_set = [X[i] for i in test]
        test_label = [y[i] for i in test]
        X_5CV['train_%d' % cv_idx] = train_set
        X_5CV['test_%d' % cv_idx] = test_set
        X_5CV['train_label_%d' % cv_idx] = train_label
        X_5CV['test_label_%d' % cv_idx] = test_label
        cv_idx = cv_idx + 1
    return X_5CV
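
# Illustrative example of the returned dictionary (keys as built above; the driver/nondriver
# lists and the seed value are whatever the caller passes in):
#   cv = generate_5CV_set(drivers, nondrivers, randseed=42)
#   cv['train_1'], cv['train_label_1']   # genes and labels of the first training fold
#   cv['test_5'], cv['test_label_5']     # genes and labels of the fifth test fold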


# Feature files (gene mutation, gene expression, DNA methylation) for the BLCA cohort
feat_file_lst = ['./preprocess_data/gene_mutation/MF_BLCA_mutation_matrix.tsv',
                 './preprocess_data/gene_expression/GE_BLCA_expression_matrix.tsv',
                 './preprocess_data/DNA_methylation/METH_BLCA_methylation_RATIO_mean.tsv']
network_file = './data/PathNet/PathNet.txt'
feat_name_lst = ['mut', 'exp', 'methy']

# Concatenate multiple features to form one feature matrix
net_features = build_customized_feature_matrix(feat_file_lst, network_file, feat_name_lst)

# The dataset dictionary contains the following entries:
#   feature: the gene feature matrix
#   edge_index: graph edges for training the model
#   node_name: gene names
#   feature_name: feature names
#   label: true labels of genes (0 for negative samples, 1 for positive samples)
#   split_set: 5CV splits randomly generated ten times
#   mask: mask for training a single model without cross-validation
dataset = dict()
dataset['feature'] = torch.FloatTensor(np.array(net_features))
dataset['node_name'] = net_features.index.values.tolist()
# Create edge_index from the edges in the network file
edge_index = create_edge_index(network_file, net_features)
dataset['edge_index'] = torch.LongTensor(np.array(edge_index).transpose())
dataset['feature_name'] = net_features.columns.values.tolist()

# Generate ten rounds of 5CV splits
# Canonical driver genes (positive samples)
d_lst = pd.read_table(filepath_or_buffer='./data/796_drivers.txt', sep='\t', header=None, index_col=None,
                      names=['driver'])
d_lst = d_lst['driver'].values.tolist()
# Nondriver genes (negative samples)
nd_lst = pd.read_table(filepath_or_buffer='./data/2187_nondrivers.txt', sep='\t', header=None,
                       index_col=None, names=['nondriver'])
nd_lst = nd_lst['nondriver'].values.tolist()
# True labels of genes
labels = []
mask = []  # mask for training a single model without cross-validation
for g in dataset['node_name']:
    if g in d_lst:
        labels.append(1)
    else:
        labels.append(0)
    if (g in d_lst) or (g in nd_lst):
        mask.append(True)
    else:
        mask.append(False)

d_in_net = []  # Canonical driver genes in the network
nd_in_net = []  # Nondriver genes in the network
for g in dataset['node_name']:
    if g in d_lst:
        d_in_net.append(g)
    elif g in nd_lst:
        nd_in_net.append(g)

k_sets_net = dict()
for k in np.arange(0, 10):  # Randomly generate 5CV splits ten times
    k_sets_net[k] = []
    randseed = (k + 1) % 100 + (k + 1) * 5
    cv = generate_5CV_set(d_in_net, nd_in_net, randseed)
    for cv_idx in np.arange(1, 6):
        tr_mask = []  # train mask
        te_mask = []  # test mask
        for g in dataset['node_name']:
            if g in cv['train_%d' % cv_idx]:
                tr_mask.append(True)
            else:
                tr_mask.append(False)
            if g in cv['test_%d' % cv_idx]:
                te_mask.append(True)
            else:
                te_mask.append(False)
        tr_mask = np.array(tr_mask)
        te_mask = np.array(te_mask)
        k_sets_net[k].append((tr_mask, te_mask))

dataset['label'] = torch.FloatTensor(np.array(labels))
dataset['split_set'] = k_sets_net
dataset['mask'] = np.array(mask)

# Save the dataset as a pickle file, which can be used for training HGDC
with open('./data/dataset_BLCA_ten_5CV.pkl', 'wb') as f:
    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
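
# A minimal sketch of how the saved dataset might be consumed downstream. This is an
# assumption for illustration only; the actual HGDC training code may load it differently:
#   from torch_geometric.data import Data
#   with open('./data/dataset_BLCA_ten_5CV.pkl', 'rb') as f:
#       ds = pickle.load(f)
#   data = Data(x=ds['feature'], edge_index=ds['edge_index'], y=ds['label'])
#   tr_mask, te_mask = ds['split_set'][0][0]  # first repeat, first fold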