-
Notifications
You must be signed in to change notification settings - Fork 6
/
download_abide_preproc.py
247 lines (216 loc) · 8.91 KB
/
download_abide_preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# download_abide_preproc.py
#
# Author: Daniel Clark, 2015
'''
This script downloads data from the Preprocessed Connectomes Project's
ABIDE Preprocessed data release and stores the files in a local
directory; users specify derivative, pipeline, strategy, and optionally
age ranges, sex, site of interest
Usage:
python download_abide_preproc.py -d <derivative> -p <pipeline>
-s <strategy> -o <out_dir>
[-lt <less_than>] [-gt <greater_than>]
[-x <sex>] [-t <site>]
'''
# Main collect and download function
def collect_and_download(derivative, pipeline, strategy, out_dir,
                         less_than, greater_than, site, sex):
    '''
    Function to collect and download images from the ABIDE preprocessed
    directory on FCP-INDI's S3 bucket

    Parameters
    ----------
    derivative : string
        derivative or measure of interest
    pipeline : string
        pipeline used to process data of interest
    strategy : string
        noise removal strategy used to process data of interest
    out_dir : string
        filepath to a local directory to save files to
    less_than : float
        upper age (years) threshold for participants of interest
    greater_than : float
        lower age (years) threshold for participants of interest
    site : string
        acquisition site of interest (None to include all sites)
    sex : string
        'M' or 'F' to indicate whether to download male or female data
        (None to include both)

    Returns
    -------
    None
        this function does not return a value; it downloads data from
        S3 to a local directory
    '''

    # Import packages
    import os
    # urllib's fetch functions moved to urllib.request in Python 3;
    # support both interpreter lines
    try:
        from urllib.request import urlopen, urlretrieve
    except ImportError:
        from urllib import urlopen, urlretrieve

    # Init variables
    # Exclude subjects whose mean framewise displacement exceeds this (mm)
    mean_fd_thresh = 0.2
    s3_prefix = 'https://s3.amazonaws.com/fcp-indi/data/Projects/'\
                'ABIDE_Initiative'
    s3_pheno_path = '/'.join([s3_prefix, 'Phenotypic_V1_0b_preprocessed1.csv'])

    # Format input arguments to be lower case, if not already
    derivative = derivative.lower()
    pipeline = pipeline.lower()
    strategy = strategy.lower()

    # ROI time-series derivatives are 1D text files; all others are NIfTI
    if 'roi' in derivative:
        extension = '.1D'
    else:
        extension = '.nii.gz'

    # If output path doesn't exist, create it
    if not os.path.exists(out_dir):
        print('Could not find %s, creating now...' % out_dir)
        os.makedirs(out_dir)

    # Load the phenotype file from S3 (decode bytes lines under Python 3)
    s3_pheno_file = urlopen(s3_pheno_path)
    pheno_list = [line.decode() if isinstance(line, bytes) else line
                  for line in s3_pheno_file.readlines()]

    # Get header indices
    header = pheno_list[0].split(',')
    try:
        site_idx = header.index('SITE_ID')
        file_idx = header.index('FILE_ID')
        age_idx = header.index('AGE_AT_SCAN')
        sex_idx = header.index('SEX')
        mean_fd_idx = header.index('func_mean_fd')
    except Exception as exc:
        err_msg = 'Unable to extract header information from the pheno file: %s'\
                  '\nHeader should have pheno info: %s\nError: %s'\
                  % (s3_pheno_path, str(header), exc)
        raise Exception(err_msg)

    # Go through pheno file and build download paths
    print('Collecting images of interest...')
    s3_paths = []
    for pheno_row in pheno_list[1:]:

        # Comma separate the row
        cs_row = pheno_row.split(',')

        try:
            # See if it was preprocessed
            row_file_id = cs_row[file_idx]
            # Read in participant info
            row_site = cs_row[site_idx]
            row_age = float(cs_row[age_idx])
            row_sex = cs_row[sex_idx]
            row_mean_fd = float(cs_row[mean_fd_idx])
        except Exception:
            # Malformed/incomplete row (e.g. empty mean-fd field) -- skip it
            print('Error extracting info from phenotypic file, skipping...')
            continue

        # If the filename isn't specified, skip
        if row_file_id == 'no_filename':
            continue
        # If mean fd is too large, skip
        if row_mean_fd >= mean_fd_thresh:
            continue

        # Test phenotypic criteria (three if's looks cleaner than one long if)
        # Test sex -- the pheno file codes male as '1' and female as '2'
        if (sex == 'M' and row_sex != '1') or (sex == 'F' and row_sex != '2'):
            continue
        # Test site
        if site is not None and site.lower() != row_site.lower():
            continue
        # Test age range (exclusive bounds, matching the original logic)
        if greater_than < row_age < less_than:
            filename = row_file_id + '_' + derivative + extension
            s3_path = '/'.join([s3_prefix, 'Outputs', pipeline, strategy,
                                derivative, filename])
            print('Adding %s to download queue...' % s3_path)
            s3_paths.append(s3_path)

    # And download the items
    total_num_files = len(s3_paths)
    for path_idx, s3_path in enumerate(s3_paths):
        # NOTE: str.lstrip(s3_prefix) would strip a *character set*, not a
        # prefix -- slice the known prefix off to get the bucket-relative path
        rel_path = s3_path[len(s3_prefix):].lstrip('/')
        download_file = os.path.join(out_dir, rel_path)
        download_dir = os.path.dirname(download_file)
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
        try:
            if not os.path.exists(download_file):
                print('Retrieving: %s' % download_file)
                urlretrieve(s3_path, download_file)
                print('%.3f%% percent complete' %
                      (100 * (float(path_idx + 1) / total_num_files)))
            else:
                print('File %s already exists, skipping...' % download_file)
        except Exception:
            # Best-effort download: report the failure and move on
            print('There was a problem downloading %s.\n'
                  'Check input arguments and try again.' % s3_path)

    # Print all done
    print('Done!')
# Make module executable
if __name__ == '__main__':

    # Import packages
    import argparse
    import os
    import sys

    # Init argparser
    parser = argparse.ArgumentParser(description=__doc__)

    # Required arguments
    parser.add_argument('-d', '--derivative', nargs=1, required=True, type=str,
                        help='Derivative of interest (e.g. \'reho\')')
    parser.add_argument('-p', '--pipeline', nargs=1, required=True, type=str,
                        help='Pipeline used to preprocess the data '
                             '(e.g. \'cpac\')')
    parser.add_argument('-s', '--strategy', nargs=1, required=True, type=str,
                        help='Noise-removal strategy used during preprocessing '
                             '(e.g. \'nofilt_noglobal\')')
    parser.add_argument('-o', '--out_dir', nargs=1, required=True, type=str,
                        help='Path to local folder to download files to')

    # Optional arguments
    parser.add_argument('-lt', '--less_than', nargs=1, required=False,
                        type=float, help='Upper age threshold (in years) of '
                                         'participants to download (e.g. for '
                                         'subjects 30 or younger, \'-lt 31\')')
    # type was previously int; float for consistency with --less_than and to
    # accept fractional ages
    parser.add_argument('-gt', '--greater_than', nargs=1, required=False,
                        type=float, help='Lower age threshold (in years) of '
                                         'participants to download (e.g. for '
                                         'subjects 31 or older, \'-gt 30\')')
    parser.add_argument('-t', '--site', nargs=1, required=False, type=str,
                        help='Site of interest to download from '
                             '(e.g. \'Caltech\')')
    parser.add_argument('-x', '--sex', nargs=1, required=False, type=str,
                        help='Participant sex of interest to download only '
                             '(e.g. \'M\' or \'F\')')

    # Parse and gather arguments
    args = parser.parse_args()

    # Init variables
    derivative = args.derivative[0].lower()
    pipeline = args.pipeline[0].lower()
    strategy = args.strategy[0].lower()
    out_dir = os.path.abspath(args.out_dir[0])

    # Try and init optional arguments
    # (unset optional args are None, so subscripting them raises TypeError)
    try:
        less_than = args.less_than[0]
        print('Using upper age threshold of %d...' % less_than)
    except TypeError:
        less_than = 200.0
        print('No upper age threshold specified')

    try:
        greater_than = args.greater_than[0]
        # BUGFIX: this message previously printed less_than by mistake
        print('Using lower age threshold of %d...' % greater_than)
    except TypeError:
        greater_than = -1.0
        print('No lower age threshold specified')

    try:
        site = args.site[0]
    except TypeError:
        site = None
        print('No site specified, using all sites...')

    try:
        sex = args.sex[0].upper()
        if sex == 'M':
            print('Downloading only male subjects...')
        elif sex == 'F':
            print('Downloading only female subjects...')
        else:
            print('Please specify \'M\' or \'F\' for sex and try again')
            sys.exit()
    except TypeError:
        sex = None
        print('No sex specified, using all sexes...')

    # Call the collect and download routine
    collect_and_download(derivative, pipeline, strategy, out_dir,
                         less_than, greater_than, site, sex)