-
Notifications
You must be signed in to change notification settings - Fork 0
/
genes_from_top_pathways.py
101 lines (87 loc) · 4.07 KB
/
genes_from_top_pathways.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pandas as pd
import os
MIN_GENES_IN_PATH = 1
sample = "CSF"
folder_paths = [f"/data/ashevtsov/MS_data/{sample}/oncobox_res"]
out_top10_path_each_celltype_path = f"/data/ashevtsov/MS_data/{sample}/oncobox_res/merged/{sample}_top10_all_cells_bipathways_min{MIN_GENES_IN_PATH}genes.csv"
out_genes_top10_path_each_celltype_path = f"/data/ashevtsov/MS_data/{sample}/oncobox_res/merged/{sample}_genes_from_top10_all_cells_bipathways_min{MIN_GENES_IN_PATH}genes.csv"
def find_genes_by_path(path):
genes_path_i = []
# need to rename reactome and biocarta databases foldes to lowercase and KEGG adjasted to KEGG!
if path.split("_")[0] in ["NCI", "biocarta", "reactome", "KEGG"]:
df = pd.read_csv(os.path.join("/data/ashevtsov/oncoboxlib/databases/" + path.split("_")[0] + " 1.123/arr.csv"))
a = df[df[path] > 0]["gene"].to_numpy()
for i in a:
genes_path_i.append(i)
else:
try:
df = pd.read_csv(os.path.join("/data/ashevtsov/oncoboxlib/databases/" + "Metabolism" + " 1.123/arr.csv"))
a = df[df[path] > 0]["gene"].to_numpy()
for i in a:
genes_path_i.append(i)
except:
df = pd.read_csv(os.path.join("/data/ashevtsov/oncoboxlib/databases/" + "Qiagen 1.123" + "/arr.csv"))
a = df[df[path] > 0]["gene"].to_numpy()
for i in a:
genes_path_i.append(i)
return genes_path_i
def make_list_genes_of_minNgene_pathways(path, N, ascending):
# print(path)
check = 0
genes_for_each_path = {}
df = pd.read_csv(path)
pathways = []
df = df.sort_values(by="Tumour_geomean", ascending= ascending)
i = 0
while check < 10:
biopath = df.iloc[i, 0]
# print(biopath)
i += 1
genes_for_path = find_genes_by_path(biopath)
if len(genes_for_path) >= N:
pathways.append(biopath)
print(f"Added {biopath}, number genes - {len(genes_for_path)}")
genes_for_each_path[biopath] = genes_for_path
check += 1
else:
print(biopath, "NOT enough genes", "got-", len(genes_for_path), "need-", N)
genes = []
for i in genes_for_each_path.values():
genes += i
return genes_for_each_path, genes, pathways
# d = {}
#
#
# for folder_path in folder_paths:
# for file in os.listdir(folder_path):
# if "merged" not in file:
# file_path = os.path.join(folder_path, file)
# results = pd.read_csv( file_path, sep = ",")
# # print(results)
# top_pos_pathways = list((results.sort_values("Tumour_geomean",ascending = False).head(10))["pathway"])
# top_neg_pathways = list((results.sort_values("Tumour_geomean",ascending = True).head(10))["pathway"])
#
# d[file[:-4] + "_pos"] = top_pos_pathways
# d[file[:-4] + "_neg"] = top_neg_pathways
#
# df = pd.DataFrame(d)
# df.to_csv(out_top10_path_each_celltype_path)
genes_csf_all = {}
pathways_all = {}
for folder_path in folder_paths:
for file in os.listdir(folder_path):
if "merged" not in file:
print("-----------",file, "--------------")
path = os.path.join(folder_path, file)
genes_for_each_path, genes_pos, pathways_pos = make_list_genes_of_minNgene_pathways(path, MIN_GENES_IN_PATH, False)
genes_csf_all[file[:-4] + "_pos"] = genes_pos
pathways_all[file[:-4] + "_pos"] = pathways_pos
genes_for_each_path, genes_neg, pathways_neg = make_list_genes_of_minNgene_pathways(path, MIN_GENES_IN_PATH, True)
genes_csf_all[file[:-4] + "_neg"] = genes_neg
pathways_all[file[:-4] + "_neg"] = pathways_neg
df_genes = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in genes_csf_all.items() ]))
df_genes.to_csv(out_genes_top10_path_each_celltype_path)
df_paths = pd.DataFrame(pathways_all)
df_paths.to_csv(out_top10_path_each_celltype_path)