Skip to content

Commit

Permalink
Masking now works with the index; trouble remains with masked feature extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Mann authored and Michael Mann committed Dec 28, 2018
1 parent 331d3ff commit 42a6012
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 162 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ numpy>=1.14.0
scikit-learn>=0.19.1
pandas>=0.22.0
geojson>=2.2.0
rasterio>=1.0.2
rasterio>=1.0.2
dask>=1.0.0
272 changes: 136 additions & 136 deletions tsraster/calculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor
from tsfresh.feature_selection.relevance import calculate_relevance_table as crt
from tsraster.prep import image_to_series, image_to_array, read_images, image_to_series2
from tsraster.prep import image_to_series, image_to_array, read_images
from tsfresh.utilities.distribution import LocalDaskDistributor


def CreateTiff(Name, Array, driver, NDV, GeoT, Proj, DataType, path):
Expand Down Expand Up @@ -71,7 +72,7 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
print('df: '+os.path.join(path,'my_df.csv'))
my_df.to_csv(os.path.join(path,'my_df.csv'), chunksize=10000, index=False)

Distributor = MultiprocessingDistributor(n_workers=6,
Distributor = MultiprocessingDistributor(n_workers=2,
disable_progressbar=False,
progressbar_title="Feature Extraction")

Expand All @@ -80,9 +81,11 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
default_fc_parameters=parameters,
column_sort="time",
column_value="value",
column_id="id",
column_id="pixel_id",
distributor=Distributor
)
# change index name to match
extracted_features.index.rename('pixel_id',inplace=True)

# deal with output location
out_path = Path(path).parent.joinpath(Path(path).stem+"_features")
Expand All @@ -106,9 +109,6 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):

# write out features to tiff file
if tiff_output == False:

'''tiff_output is true and by default exports tiff '''

return extracted_features
else:
# get image dimension from raw data
Expand Down Expand Up @@ -136,132 +136,134 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
return extracted_features


def calculateFeatures2(path, parameters, mask=None, reset_df=True, tiff_output=True,
                       missing_value=-9999, workers=2):
    '''
    Calculates features or the statistical characteristics of time-series raster data.
    It can also save features as a csv file (dataframe) and/or tiff file.

    :param path: directory path to the raster files
    :param parameters: a dictionary of tsfresh features to be extracted
    :param mask: optional mask forwarded to image_to_series2 to exclude pixels
    :param reset_df: if False, reuse the CSVs previously cached in *path*
                     instead of re-reading the rasters
    :param tiff_output: boolean option for exporting tiff file
    :param missing_value: sentinel treated as missing/no-data in the series
    :param workers: number of worker processes for feature extraction
    :return: extracted features as a dataframe (indexed by pixel_id) and,
             optionally, a tiff file; if the per-pixel observation counts
             are not uniform, returns the uniformity check result instead
    '''
    if not reset_df:
        # Reuse the long-format dataframe cached by a previous run.
        df_long = pd.read_csv(os.path.join(path, 'df_long.csv'))

        # Rebuild a skeleton with the original (unmasked) pixel index so the
        # extracted features can later be unmasked back to full-image shape.
        df_original = pd.read_csv(os.path.join(path, 'df_original.csv'))
        df_original = pd.DataFrame(index=pd.RangeIndex(start=0,
                                                       stop=len(df_original),
                                                       step=1),
                                   dtype=np.float32)
        df_original.index.names = ['pixel_id']
    else:
        # Read rasters, build the long-format series, and cache both frames.
        df_long, df_original = image_to_series2(path, mask)

        print('df: ' + os.path.join(path, 'df_long.csv'))
        df_long.to_csv(os.path.join(path, 'df_long.csv'),
                       chunksize=10000,
                       index=False)
        df_original.to_csv(os.path.join(path, 'df_original.csv'),
                           chunksize=10000,
                           index=True)

    # Drop missing observations so they do not pollute the statistics.
    df_long = df_long[df_long['value'] != missing_value]

    # tsfresh needs a uniform series: every (pixel_id, kind) group must hold
    # the same number of observations.
    # NOTE(review): the original used `~counts.all()`, which can never fire —
    # per-group counts are always non-zero — so the guard was dead code.
    # Check count uniformity explicitly instead.
    obs_counts = df_long.groupby(['pixel_id', 'kind']).kind.count()
    if obs_counts.nunique() > 1:
        print('ERROR: the number of observation per pixel are not identical')
        print('    fix missing values to have a uniform time series')
        print(df_long.groupby(['time']).time.unique())
        # Falsy result signals the failed uniformity check to the caller.
        return obs_counts.nunique() == 1

    Distributor = MultiprocessingDistributor(n_workers=workers,
                                             disable_progressbar=False,
                                             progressbar_title="Feature Extraction")
    # Distributor = LocalDaskDistributor(n_workers=2)

    extracted_features = extract_features(df_long,
                                          default_fc_parameters=parameters,
                                          column_id="pixel_id",
                                          column_sort="time",
                                          column_kind="kind",
                                          column_value="value",
                                          distributor=Distributor)

    # extract_features indexes rows by the id column; name the index so it
    # aligns with df_original's 'pixel_id' index during concat.
    extracted_features.index.name = 'pixel_id'

    # Unmask: align the (masked) features with the full original pixel index.
    extracted_features = pd.concat([df_original, extracted_features], axis=1)

    # Pixels removed by the mask get the missing-value marker.
    extracted_features.fillna(missing_value, inplace=True)

    # Output directory: sibling of *path* named "<stem>_features".
    out_path = Path(path).parent.joinpath(Path(path).stem + "_features")
    out_path.mkdir(parents=True, exist_ok=True)

    # Write out features to csv file.
    print("features:" + os.path.join(out_path, 'extracted_features.csv'))
    extracted_features.to_csv(os.path.join(out_path, 'extracted_features.csv'),
                              chunksize=10000)

    # Band-number -> feature-name lookup table (1-based, matching tiff bands).
    kr = pd.DataFrame(list(extracted_features.columns))
    kr.index += 1
    kr.index.names = ['band']
    kr.columns = ['feature_name']
    kr.to_csv(os.path.join(out_path, "features_names.csv"))

    if not tiff_output:
        # tiff_output is True by default and exports a tiff.
        return extracted_features

    # Get image dimensions from the raw data to reshape the feature matrix.
    rows, cols, num = image_to_array(path).shape
    matrix_features = extracted_features.values
    num_of_layers = matrix_features.shape[1]

    # Reshape to (rows, cols, bands): one tiff band per extracted feature.
    f2Array = matrix_features.reshape(rows, cols, num_of_layers)
    output_file = 'extracted_features.tiff'

    # Copy georeferencing metadata from the first raw image.
    raw_data = read_images(path)
    GeoTransform = raw_data[0].GetGeoTransform()
    driver = gdal.GetDriverByName('GTiff')

    noData = -9999

    Projection = raw_data[0].GetProjectionRef()
    DataType = gdal.GDT_Float32

    # Export tiff.
    CreateTiff(output_file, f2Array, driver, noData, GeoTransform,
               Projection, DataType, path=out_path)
    return extracted_features
#def calculateFeatures2(path, parameters, mask=None, reset_df=True, tiff_output=True,
# missing_value =-9999,workers=2):
# '''
# Calculates features or the statistical characteristics of time-series raster data.
# It can also save features as a csv file (dataframe) and/or tiff file.
#
# :param path: directory path to the raster files
# :param parameters: a dictionary of features to be extracted
# :param reset_df: boolean option for existing raster inputs as dataframe
# :param tiff_output: boolean option for exporting tiff file
# :return: extracted features as a dataframe and tiff file
# '''
#
# if reset_df == False:
# #if reset_df =F read in csv file holding saved version of my_df
# df_long = pd.read_csv(os.path.join(path,'df_long.csv'))
#
# # create example of original df to help unmask
# df_original = pd.read_csv(os.path.join(path,'df_original.csv') )
# df_original = pd.DataFrame(index = pd.RangeIndex(start=0,
# stop=len(df_original),
# step=1),
# dtype=np.float32)
#
# # set index name to pixel id
# df_original.index.names = ['pixel_id']
#
# else:
# #if reset_df =T calculate ts_series and save csv
# df_long, df_original = image_to_series2(path,
# mask)
#
# print('df: '+os.path.join(path,'df_long.csv'))
# df_long.to_csv(os.path.join(path,'df_long.csv'),
# chunksize=10000,
# index=False)
#
# df_original.to_csv(os.path.join(path,'df_original.csv'),
# chunksize=10000,
# index=True)
#
# # remove missing values from df_long
# df_long = df_long[df_long['value'] != missing_value]
#
# # check if the number of observation per pixel are not identical
# if ~df_long.groupby(['pixel_id','kind']).kind.count().all():
# print('ERROR: the number of observation per pixel are not identical')
# print(' fix missing values to have a uniform time series')
# print(df_long.groupby(['time']).time.unique())
#
# return(df_long.groupby(['pixel_id','kind']).kind.count().all())
#
#
# Distributor = MultiprocessingDistributor(n_workers=workers,
# disable_progressbar=False,
# progressbar_title="Feature Extraction")
# #Distributor = LocalDaskDistributor(n_workers=2)
#
# extracted_features = extract_features(df_long,
# #chunksize=10e6,
# default_fc_parameters=parameters,
# column_id="pixel_id",
# column_sort="time",
# column_kind="kind",
# column_value="value",
# distributor=Distributor
# )
#
# # extracted_features.index is == df_long.pixel_id
# extracted_features.index.name= 'pixel_id'
#
#
# #unmask extracted features to match df_original index
# extracted_features = pd.concat( [df_original, extracted_features],
# axis=1 )
#
# # fill missing values with correct
# extracted_features.fillna(missing_value, inplace=True)
#
#
# # deal with output location
# out_path = Path(path).parent.joinpath(Path(path).stem+"_features")
# out_path.mkdir(parents=True, exist_ok=True)
#
# # write out features to csv file
# print("features:"+os.path.join(out_path,'extracted_features.csv'))
# extracted_features.to_csv(os.path.join(out_path,'extracted_features.csv'), chunksize=10000)
#
# # write data frame
# kr = pd.DataFrame(list(extracted_features.columns))
# kr.index += 1
# kr.index.names = ['band']
# kr.columns = ['feature_name']
# kr.to_csv(os.path.join(out_path,"features_names.csv"))
#
#
# # write out features to tiff file
# if tiff_output == False:
#
# '''tiff_output is true and by default exports tiff '''
#
# return extracted_features
#
# else:
# print('use export_features instead')
# # get image dimension from raw data
# rows, cols, num = image_to_array(path).shape
# # get the total number of features extracted
# matrix_features = extracted_features.values
# num_of_layers = matrix_features.shape[1]
#
# #reshape the dimension of features extracted
# f2Array = matrix_features.reshape(rows, cols, num_of_layers)
# output_file = 'extracted_features.tiff'
#
# #Get Meta Data from raw data
# raw_data = read_images(path)
# GeoTransform = raw_data[0].GetGeoTransform()
# driver = gdal.GetDriverByName('GTiff')
#
# noData = -9999
#
# Projection = raw_data[0].GetProjectionRef()
# DataType = gdal.GDT_Float32
#
# #export tiff
# CreateTiff(output_file, f2Array, driver, noData, GeoTransform, Projection, DataType, path=out_path)
# return extracted_features


def features_to_array(path, input_file):
Expand Down Expand Up @@ -311,8 +313,6 @@ def exportFeatures(path, input_file, output_file,
return export_features




def checkRelevance(x, y, ml_task="auto", fdr_level=0.05):
'''
Checks the statistical relevance of features to the target data
Expand All @@ -327,7 +327,7 @@ def checkRelevance(x, y, ml_task="auto", fdr_level=0.05):
target = y

# drop id column
features = features.drop(labels="id", axis=1)
features = features.drop(labels=["id",'index'], axis=1, errors ='ignore')

# calculate relevance
relevance_test = crt(features,
Expand All @@ -350,7 +350,7 @@ def checkRelevance2(x, y, ml_task="auto", fdr_level=0.05):
target = y

# drop id column
features = features.drop(labels="id", axis=1,errors='ignore')
features = features.drop(labels=["id",'index'], axis=1,errors='ignore')

# calculate relevance
relevance_test = crt(features,
Expand Down
Loading

0 comments on commit 42a6012

Please sign in to comment.