Skip to content

Commit

Permalink
Masking now works with the index; trouble remains with masked feature extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Mann authored and Michael Mann committed Dec 28, 2018
1 parent 331d3ff commit 42a6012
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 162 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ numpy>=1.14.0
scikit-learn>=0.19.1
pandas>=0.22.0
geojson>=2.2.0
rasterio>=1.0.2
rasterio>=1.0.2
dask>=1.0.0
272 changes: 136 additions & 136 deletions tsraster/calculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor, LocalDaskDistributor
from tsfresh.feature_selection.relevance import calculate_relevance_table as crt
from tsraster.prep import image_to_series, image_to_array, read_images, image_to_series2
from tsraster.prep import image_to_series, image_to_array, read_images
from tsfresh.utilities.distribution import LocalDaskDistributor


def CreateTiff(Name, Array, driver, NDV, GeoT, Proj, DataType, path):
Expand Down Expand Up @@ -71,7 +72,7 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
print('df: '+os.path.join(path,'my_df.csv'))
my_df.to_csv(os.path.join(path,'my_df.csv'), chunksize=10000, index=False)

Distributor = MultiprocessingDistributor(n_workers=6,
Distributor = MultiprocessingDistributor(n_workers=2,
disable_progressbar=False,
progressbar_title="Feature Extraction")

Expand All @@ -80,9 +81,11 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
default_fc_parameters=parameters,
column_sort="time",
column_value="value",
column_id="id",
column_id="pixel_id",
distributor=Distributor
)
# change index name to match
extracted_features.index.rename('pixel_id',inplace=True)

# deal with output location
out_path = Path(path).parent.joinpath(Path(path).stem+"_features")
Expand All @@ -106,9 +109,6 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):

# write out features to tiff file
if tiff_output == False:

'''tiff_output is true and by default exports tiff '''

return extracted_features
else:
# get image dimension from raw data
Expand Down Expand Up @@ -136,132 +136,134 @@ def calculateFeatures(path, parameters, reset_df, tiff_output=True):
return extracted_features


def calculateFeatures2(path, parameters, mask=None, reset_df=True, tiff_output=True,
                       missing_value=-9999, workers=2):
    '''
    Calculates features or the statistical characteristics of time-series raster data.
    It can also save features as a csv file (dataframe) and/or tiff file.

    :param path: directory path to the raster files
    :param parameters: a dictionary of tsfresh features to be extracted
    :param mask: optional mask forwarded to image_to_series2 to exclude pixels
    :param reset_df: if False, reuse the CSVs previously cached in *path*
                     instead of re-reading the rasters
    :param tiff_output: boolean option for exporting tiff file
    :param missing_value: sentinel treated as missing/no-data in the series
    :param workers: number of worker processes for feature extraction
    :return: extracted features as a dataframe (indexed by pixel_id) and,
             optionally, a tiff file; if the per-pixel observation counts
             are not uniform, returns the uniformity check result instead
    '''
    if not reset_df:
        # Reuse the long-format dataframe cached by a previous run.
        df_long = pd.read_csv(os.path.join(path, 'df_long.csv'))

        # Rebuild a skeleton with the original (unmasked) pixel index so the
        # extracted features can later be unmasked back to full-image shape.
        df_original = pd.read_csv(os.path.join(path, 'df_original.csv'))
        df_original = pd.DataFrame(index=pd.RangeIndex(start=0,
                                                       stop=len(df_original),
                                                       step=1),
                                   dtype=np.float32)
        df_original.index.names = ['pixel_id']
    else:
        # Read rasters, build the long-format series, and cache both frames.
        df_long, df_original = image_to_series2(path, mask)

        print('df: ' + os.path.join(path, 'df_long.csv'))
        df_long.to_csv(os.path.join(path, 'df_long.csv'),
                       chunksize=10000,
                       index=False)
        df_original.to_csv(os.path.join(path, 'df_original.csv'),
                           chunksize=10000,
                           index=True)

    # Drop missing observations so they do not pollute the statistics.
    df_long = df_long[df_long['value'] != missing_value]

    # tsfresh needs a uniform series: every (pixel_id, kind) group must hold
    # the same number of observations.
    # NOTE(review): the original used `~counts.all()`, which can never fire —
    # per-group counts are always non-zero — so the guard was dead code.
    # Check count uniformity explicitly instead.
    obs_counts = df_long.groupby(['pixel_id', 'kind']).kind.count()
    if obs_counts.nunique() > 1:
        print('ERROR: the number of observation per pixel are not identical')
        print('    fix missing values to have a uniform time series')
        print(df_long.groupby(['time']).time.unique())
        # Falsy result signals the failed uniformity check to the caller.
        return obs_counts.nunique() == 1

    Distributor = MultiprocessingDistributor(n_workers=workers,
                                             disable_progressbar=False,
                                             progressbar_title="Feature Extraction")
    # Distributor = LocalDaskDistributor(n_workers=2)

    extracted_features = extract_features(df_long,
                                          default_fc_parameters=parameters,
                                          column_id="pixel_id",
                                          column_sort="time",
                                          column_kind="kind",
                                          column_value="value",
                                          distributor=Distributor)

    # extract_features indexes rows by the id column; name the index so it
    # aligns with df_original's 'pixel_id' index during concat.
    extracted_features.index.name = 'pixel_id'

    # Unmask: align the (masked) features with the full original pixel index.
    extracted_features = pd.concat([df_original, extracted_features], axis=1)

    # Pixels removed by the mask get the missing-value marker.
    extracted_features.fillna(missing_value, inplace=True)

    # Output directory: sibling of *path* named "<stem>_features".
    out_path = Path(path).parent.joinpath(Path(path).stem + "_features")
    out_path.mkdir(parents=True, exist_ok=True)

    # Write out features to csv file.
    print("features:" + os.path.join(out_path, 'extracted_features.csv'))
    extracted_features.to_csv(os.path.join(out_path, 'extracted_features.csv'),
                              chunksize=10000)

    # Band-number -> feature-name lookup table (1-based, matching tiff bands).
    kr = pd.DataFrame(list(extracted_features.columns))
    kr.index += 1
    kr.index.names = ['band']
    kr.columns = ['feature_name']
    kr.to_csv(os.path.join(out_path, "features_names.csv"))

    if not tiff_output:
        # tiff_output is True by default and exports a tiff.
        return extracted_features

    # Get image dimensions from the raw data to reshape the feature matrix.
    rows, cols, num = image_to_array(path).shape
    matrix_features = extracted_features.values
    num_of_layers = matrix_features.shape[1]

    # Reshape to (rows, cols, bands): one tiff band per extracted feature.
    f2Array = matrix_features.reshape(rows, cols, num_of_layers)
    output_file = 'extracted_features.tiff'

    # Copy georeferencing metadata from the first raw image.
    raw_data = read_images(path)
    GeoTransform = raw_data[0].GetGeoTransform()
    driver = gdal.GetDriverByName('GTiff')

    noData = -9999

    Projection = raw_data[0].GetProjectionRef()
    DataType = gdal.GDT_Float32

    # Export tiff.
    CreateTiff(output_file, f2Array, driver, noData, GeoTransform,
               Projection, DataType, path=out_path)
    return extracted_features
#def calculateFeatures2(path, parameters, mask=None, reset_df=True, tiff_output=True,
# missing_value =-9999,workers=2):
# '''
# Calculates features or the statistical characteristics of time-series raster data.
# It can also save features as a csv file (dataframe) and/or tiff file.
#
# :param path: directory path to the raster files
# :param parameters: a dictionary of features to be extracted
# :param reset_df: boolean option for existing raster inputs as dataframe
# :param tiff_output: boolean option for exporting tiff file
# :return: extracted features as a dataframe and tiff file
# '''
#
# if reset_df == False:
# #if reset_df =F read in csv file holding saved version of my_df
# df_long = pd.read_csv(os.path.join(path,'df_long.csv'))
#
# # create example of original df to help unmask
# df_original = pd.read_csv(os.path.join(path,'df_original.csv') )
# df_original = pd.DataFrame(index = pd.RangeIndex(start=0,
# stop=len(df_original),
# step=1),
# dtype=np.float32)
#
# # set index name to pixel id
# df_original.index.names = ['pixel_id']
#
# else:
# #if reset_df =T calculate ts_series and save csv
# df_long, df_original = image_to_series2(path,
# mask)
#
# print('df: '+os.path.join(path,'df_long.csv'))
# df_long.to_csv(os.path.join(path,'df_long.csv'),
# chunksize=10000,
# index=False)
#
# df_original.to_csv(os.path.join(path,'df_original.csv'),
# chunksize=10000,
# index=True)
#
# # remove missing values from df_long
# df_long = df_long[df_long['value'] != missing_value]
#
# # check if the number of observation per pixel are not identical
# if ~df_long.groupby(['pixel_id','kind']).kind.count().all():
# print('ERROR: the number of observation per pixel are not identical')
# print(' fix missing values to have a uniform time series')
# print(df_long.groupby(['time']).time.unique())
#
# return(df_long.groupby(['pixel_id','kind']).kind.count().all())
#
#
# Distributor = MultiprocessingDistributor(n_workers=workers,
# disable_progressbar=False,
# progressbar_title="Feature Extraction")
# #Distributor = LocalDaskDistributor(n_workers=2)
#
# extracted_features = extract_features(df_long,
# #chunksize=10e6,
# default_fc_parameters=parameters,
# column_id="pixel_id",
# column_sort="time",
# column_kind="kind",
# column_value="value",
# distributor=Distributor
# )
#
# # extracted_features.index is == df_long.pixel_id
# extracted_features.index.name= 'pixel_id'
#
#
# #unmask extracted features to match df_original index
# extracted_features = pd.concat( [df_original, extracted_features],
# axis=1 )
#
# # fill missing values with correct
# extracted_features.fillna(missing_value, inplace=True)
#
#
# # deal with output location
# out_path = Path(path).parent.joinpath(Path(path).stem+"_features")
# out_path.mkdir(parents=True, exist_ok=True)
#
# # write out features to csv file
# print("features:"+os.path.join(out_path,'extracted_features.csv'))
# extracted_features.to_csv(os.path.join(out_path,'extracted_features.csv'), chunksize=10000)
#
# # write data frame
# kr = pd.DataFrame(list(extracted_features.columns))
# kr.index += 1
# kr.index.names = ['band']
# kr.columns = ['feature_name']
# kr.to_csv(os.path.join(out_path,"features_names.csv"))
#
#
# # write out features to tiff file
# if tiff_output == False:
#
# '''tiff_output is true and by default exports tiff '''
#
# return extracted_features
#
# else:
# print('use export_features instead')
# # get image dimension from raw data
# rows, cols, num = image_to_array(path).shape
# # get the total number of features extracted
# matrix_features = extracted_features.values
# num_of_layers = matrix_features.shape[1]
#
# #reshape the dimension of features extracted
# f2Array = matrix_features.reshape(rows, cols, num_of_layers)
# output_file = 'extracted_features.tiff'
#
# #Get Meta Data from raw data
# raw_data = read_images(path)
# GeoTransform = raw_data[0].GetGeoTransform()
# driver = gdal.GetDriverByName('GTiff')
#
# noData = -9999
#
# Projection = raw_data[0].GetProjectionRef()
# DataType = gdal.GDT_Float32
#
# #export tiff
# CreateTiff(output_file, f2Array, driver, noData, GeoTransform, Projection, DataType, path=out_path)
# return extracted_features


def features_to_array(path, input_file):
Expand Down Expand Up @@ -311,8 +313,6 @@ def exportFeatures(path, input_file, output_file,
return export_features




def checkRelevance(x, y, ml_task="auto", fdr_level=0.05):
'''
Checks the statistical relevance of features to the target data
Expand All @@ -327,7 +327,7 @@ def checkRelevance(x, y, ml_task="auto", fdr_level=0.05):
target = y

# drop id column
features = features.drop(labels="id", axis=1)
features = features.drop(labels=["id",'index'], axis=1, errors ='ignore')

# calculate relevance
relevance_test = crt(features,
Expand All @@ -350,7 +350,7 @@ def checkRelevance2(x, y, ml_task="auto", fdr_level=0.05):
target = y

# drop id column
features = features.drop(labels="id", axis=1,errors='ignore')
features = features.drop(labels=["id",'index'], axis=1,errors='ignore')

# calculate relevance
relevance_test = crt(features,
Expand Down
Loading

0 comments on commit 42a6012

Please sign in to comment.