-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_loader.py
113 lines (80 loc) · 3.2 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import copy
def split_data(feature, label, test_ratio=0.2):
    """Split features and labels into train and test partitions.

    Delegates to ``train_test_split`` with a fixed ``random_state`` of 42, so
    repeated calls on the same inputs use the same seed.

    Args:
        feature: Feature container accepted by ``train_test_split``.
        label: Labels aligned with ``feature``.
        test_ratio: Value forwarded as ``test_size``.

    Returns:
        A tuple ``(train_x, test_x, train_y, test_y)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        feature,
        label,
        test_size=test_ratio,
        random_state=42,
    )
    return x_train, x_test, y_train, y_test
def avg_model(
        model1,
        model2,
        key_list=(
            'tabnet.encoder.att_transformers.0.fc.weight',
            'tabnet.encoder.feat_transformers.2.specifics.glu_layers.1.fc.weight',
            'tabnet.encoder.feat_transformers.2.specifics.glu_layers.0.fc.weight',
        )):
    """Average the leading state-dict entries of two models.

    Both models' ``network.state_dict()`` mappings are fetched, and for the
    first 21 keys (in the iteration order of the first mapping) each entry is
    replaced in *both* mappings by the element-wise mean of the two original
    values. Entries after the 21st are left untouched.

    Args:
        model1: Object exposing ``network.state_dict()``.
        model2: Object exposing ``network.state_dict()``; it must contain
            every key that is averaged from ``model1``.
        key_list: Currently ignored by the implementation. Kept for interface
            compatibility; the default is an immutable tuple so it cannot be
            mutated across calls.

    Returns:
        A tuple ``(network1, network2)`` of the two updated state mappings.
        The models themselves are not reloaded here.
    """
    network1 = model1.network.state_dict()
    network2 = model2.network.state_dict()

    # Only the first 21 entries are averaged (the original loop stopped once
    # its counter exceeded 20).
    max_entries = 21
    for position, tmp_key in enumerate(network1.keys()):
        if position >= max_entries:
            break
        first = network1[tmp_key]
        second = network2[tmp_key]
        print("==========", first, second)
        # Compute both results from the ORIGINAL values. Reading
        # network1[tmp_key] again after overwriting it would skew the second
        # result towards model2. Two separate expressions keep the two
        # mappings from sharing one result object.
        network1[tmp_key] = (first + second) / 2.0
        network2[tmp_key] = (first + second) / 2.0
    return network1, network2
def train_metrics(model, test_x, test_y):
    """Evaluate a fitted model and return ``(recall, acc)``.

    Args:
        model: Object exposing ``predict(array)``.
        test_x: Features; ``test_x.values`` is passed to ``model.predict``.
        test_y: True labels; ``test_y.values`` is compared to the predictions.

    Returns:
        A tuple ``(recall, acc)``:

        * ``acc`` is the fraction of predictions equal to the true label.
        * ``recall`` is ``sum(preds * labels)`` divided by the number of
          labels equal to 1. This matches the usual recall definition only
          when both predictions and labels are encoded as 0/1.
          NOTE(review): confirm the label encoding with the caller.
    """
    labels = test_y.values
    preds = model.predict(test_x.values)

    # Summing a boolean mask counts the matching positions.
    acc = (preds == labels).sum() / len(preds)

    positives = (labels == 1).sum()
    recall = (preds * labels).sum() / positives

    # The former `model.predict_proba(...)` call was dropped: its result was
    # never used, so it only cost a second inference pass.
    return recall, acc
def client_tabnet(new_model, client_x, client_y, params=None):
    """Run one local training epoch on a client's data.

    The model passed in is fitted directly (no copy is made), then its
    ``feature_importances_`` attribute is printed.

    Args:
        new_model: Object exposing ``fit(X, y, max_epochs=...)`` and
            ``feature_importances_``.
        client_x: Client features; ``client_x.values`` is passed to ``fit``.
        client_y: Client labels; ``client_y.values`` is passed to ``fit``.
        params: Not used by this function.

    Returns:
        The same ``new_model`` object after fitting.
    """
    features = client_x.values
    targets = client_y.values
    new_model.fit(features, targets, max_epochs=1)
    print("feature importance: ", new_model.feature_importances_)
    return new_model
def test_tabnet(data_path):
    """Load the CSV dataset and build two client splits plus a test split.

    Steps:
        1. Read ``data_path`` as a header-less CSV (columns are integer
           positions).
        2. Pop column 0 as the label and print its value counts.
        3. Fill missing values in the float columns (11-24 and 9) with each
           column's mean, computed over the whole file.
        4. Keep columns 1-8 plus the float columns (column 10 is not
           selected) and split them 80/20 via ``split_data``.
        5. Cut the training portion positionally: the first 200 rows go to
           client 0, the remainder to client 1.

    Args:
        data_path: Path to the CSV file.

    Returns:
        A tuple ``(x0, x1, y0, y1, test_x, test_y)``.
    """
    print("start")
    data = pd.read_csv(data_path,
                       engine='python',
                       encoding="utf-8",
                       header=None)

    # Extract the label column; `pop` also removes it from `data`.
    label = data.pop(0)
    print("label dict:", Counter(label))

    x = data.copy()

    # Column positions treated as float-valued and integer-valued features.
    float_inds = np.arange(11, 25).tolist() + [9]
    int_inds = np.arange(1, 9).tolist()

    # Mean-impute missing floats. Assign the result back instead of calling
    # `x[col].fillna(..., inplace=True)`: the in-place call acts on a
    # temporary column selection, which warns on recent pandas and does not
    # update `x` when Copy-on-Write is enabled.
    for col in float_inds:
        x[col] = x[col].fillna(x[col].mean())

    train_x, test_x, train_y, test_y = split_data(x[int_inds + float_inds],
                                                  label,
                                                  test_ratio=0.2)

    # Positional split of the training rows between the two clients.
    first_client_rows = 200
    x0 = train_x.iloc[:first_client_rows]
    x1 = train_x.iloc[first_client_rows:]
    y0 = train_y.iloc[:first_client_rows]
    y1 = train_y.iloc[first_client_rows:]
    return x0, x1, y0, y1, test_x, test_y
# Dataset file handed to test_tabnet below. It is a bare file name, so it is
# looked up relative to the current working directory.
data_path = "complex_disease.csv"
# Alternative absolute location used previously:
# data_path = "/home/yawei/tableNet/complex_disease.csv"
# NOTE: this call runs at import time, so importing this module reads the CSV
# and binds the two client splits and the test split as module-level names.
x0, x1, y0, y1, test_x, test_y = test_tabnet(data_path)