lidar_labeler.preprocessing

A collection of functions that facilitate preprocessing a database built with the label builder for training a convolutional neural network algorithm. Run with llb venv.

  1"""
A collection of functions that facilitate preprocessing a database built with the label builder for training a convolutional neural network algorithm.
Run with llb venv.
  4"""
  5#import packages
  6import sys
  7import os
  8# scriptDir = os.path.dirname(os.path.abspath(__file__))
  9# parentDir = os.path.dirname(scriptDir)
 10# sys.path.append(parentDir)
 11# from lidar_labeler import labeler_tools as lt
 12from lidar_labeler import labeler_tools as lt
 13
 14import geopandas as gpd
 15import pandas as pd
 16import numpy as np
 17import json
 18from PIL import Image
 19from osgeo import gdal
 20from scipy.ndimage import gaussian_filter
 21# from skimage.transform import rotate
 22gdal.UseExceptions()
 23from pathlib import Path
 24
 25# global_vars = os.path.join(parentDir, 'configs', 'global_variables.json')
 26
 27# with open(global_vars, 'r') as f:
 28#     params_dict = json.load(f)
 29
# Read the shared settings from <directory containing this file>/../configs/global_variables.json.
# This runs at import time, so importing the module fails if that JSON file is missing.
with (Path(__file__).resolve().parent.parent / 'configs' / 'global_variables.json').open('r') as f:
    params_dict = json.load(f)

# Expose the settings used in this module as module-level constants
CLIPPED_ROI_FOLDER = params_dict['CLIPPED_ROI_FOLDER']  # sub-folder load_df searches when recovering raster paths
IMG_FOLDER = params_dict['IMG_FOLDER']  # sub-folder load_df searches when recovering image paths
ROIDF_RSTR_COL_PATTERN = params_dict['ROIDF_RSTR_COL_PATTERN']  # prefix identifying raster path columns
ROIDF_IMG_COL_PATTERN = params_dict['ROIDF_IMG_COL_PATTERN']  # exact name of the image path column

LABEL_FILE_PATTERN = params_dict['LABEL_FILE_PATTERN']  # substring a file name must contain to count as a label file
LABEL_FILE_EXT = params_dict['LABEL_FILE_EXT']  # extension a label file must end with (compared against lower-cased names)
Y_COL_LABEL = params_dict['Y_COL_LABEL']  # column name holding the integer-mapped labels
X_COL_LABEL = params_dict['X_COL_LABEL']  # column name holding the loaded raster grids
 43
 44
 45def load_df(filePath:str, checkFilePaths:bool=False):
 46    """Loads df from filepaths. Checks df to see if filepaths within the df exist and attempts recovery within the database if not.
 47
 48    Args:
 49        filePath (str): Path to geodatabase where clipped rasters and labels are stored.
 50        checkFilePaths (bool): Whether to check if filepaths within the df exist. Default to False.
 51    
 52    Returns:
 53        df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.
 54    """
 55    df = gpd.read_file(filePath, truncation=False)
 56    recoveryAttempted = False  # Flag to track whether recovery attempt has been made
 57    recoverySuccessful = False
 58        
 59    # Get raster and image column names
 60    rasterCols = [col for col in df.columns if col.startswith(ROIDF_RSTR_COL_PATTERN)]
 61    imageCols = [col for col in df.columns if col == ROIDF_IMG_COL_PATTERN]
 62    
 63
 64    if checkFilePaths:
 65        #The following loops through the rows and raster columns and checks if raster/image paths are valid. If not, will attempt to find filepaths within the
 66        #df directory following the directory structure of the lidar label builder.
 67        for index, row in df.iterrows():
 68            for col in rasterCols:
 69                path = row[col]
 70                if not os.path.exists(path):
 71                    if not recoveryAttempted:
 72                        print(f'Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
 73                        recoveryAttempted = True
 74                    directory = os.path.split(filePath)[0]
 75                    fname = os.path.split(path)[1]
 76                    tryPath = os.path.join(directory, CLIPPED_ROI_FOLDER, fname)
 77                    if os.path.exists(tryPath):
 78                        df.at[index, col] = tryPath
 79                        recoverySuccessful=True
 80                        
 81                    else:
 82                        raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")
 83            for col in imageCols:
 84                path = row[col]
 85                if not os.path.exists(path):
 86                    if not recoveryAttempted:
 87                        print(f'Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
 88                        recoveryAttempted = True
 89                    directory = os.path.split(filePath)[0]
 90                    fname = os.path.split(path)[1]
 91                    tryPath = os.path.join(directory, IMG_FOLDER, fname)
 92                    if os.path.exists(tryPath):
 93                        df.at[index, col] = tryPath
 94                        recoverySuccessful = True
 95                        
 96                    else:
 97                        raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")
 98        
 99        #if files were recovered saves a new df file with the recovered raster/image filepaths
100        if recoverySuccessful:
101            basePath, ext = os.path.splitext(filePath)
102            savePath = f'{basePath}_recovered{ext}'
103            df.to_file(savePath)
104    
105    return df
106
def filter_df(df:gpd.GeoDataFrame, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """Filter a GeoDataFrame by keeping only specified data and label columns and removing entries based on conditions.

    Args:
        df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (geopandas.GeoDataFrame): A new GeoDataFrame holding only the data, label and 'geometry' columns,
        without rows whose label is missing or listed in labelsToRemove. The input df is not modified.
    """
    #Remove all columns that are not data and label columns
    filteredDf = df[[dataColumn, labelColumn, 'geometry']]

    #remove entries that have no label
    filteredDf = filteredDf.dropna(subset=[labelColumn])

    #remove entries whose label is in labelsToRemove
    if labelsToRemove:
        if isinstance(labelsToRemove, str):
            labelsToRemove = [labelsToRemove]
        filteredDf = filteredDf[~filteredDf[labelColumn].isin(labelsToRemove)]

    # Always hand back an independent copy: callers add columns to the result, and a bare
    # boolean selection would trigger pandas' chained-assignment warning when they do.
    return filteredDf.copy()
133
def _load_label_dfs_from_dir(directory, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """Loads and filters every label file found anywhere beneath a directory.

    A file counts as a label file when its name contains LABEL_FILE_PATTERN and its lower-cased name ends with LABEL_FILE_EXT.

    Returns:
        tuple: (labelFiles, dfs) - the matching file paths and their filtered dataframes, in walk order.
    """
    labelFiles = []
    dfs = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if LABEL_FILE_PATTERN in file and file.lower().endswith(LABEL_FILE_EXT):
                filePath = os.path.join(root, file)
                labelFiles.append(filePath)
                #drop any labels that are na or that match a label in labelsToRemove
                dfs.append(filter_df(load_df(filePath), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
    return labelFiles, dfs


def import_filter_concat_df(path, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """
    Import and filter GeoDataFrames from shapefiles or directories of shapefiles. If multiple shapefiles will concatenate into one single filtered df.

    Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles and/or directories. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels, and geometry with any rows containing labels matching a value in the
        labelsToRemove argument removed from the dataframe.

    Raises:
        ValueError: If the input 'path' is of an unsupported type, if the file type or directory structure is not valid,
        or if no label file could be loaded from the given location(s).
    """
    #if the input path is a string (a path to one location)
    if isinstance(path, str):
        #If this string is a file (not a directory) and matches the label file convention specified in global variables, then load that single df
        if os.path.isfile(path) and LABEL_FILE_PATTERN in path and path.lower().endswith(LABEL_FILE_EXT):
            return filter_df(load_df(path), dataColumn, labelColumn, labelsToRemove=labelsToRemove)

        if not os.path.isdir(path):
            raise ValueError(f"Unsupported file type or directory structure for path: {path}")

        #If the path is to a directory, load each file matching the label file pattern and concatenate into one df
        labelFiles, dfs = _load_label_dfs_from_dir(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
        if not dfs:
            # Previously this surfaced as an IndexError on dfs[0]
            raise ValueError(f"No label files found in directory: {path}")
        if len(dfs) == 1:
            return dfs[0]
        print('Joining Dataframes: ' + str(labelFiles))
        return pd.concat(dfs, ignore_index=True)

    #If given a list of paths, load, filter and join everything that can be loaded into one df
    if isinstance(path, list):
        labelFiles = []
        dfs = []
        for p in path:
            if os.path.isfile(p) and p.lower().endswith(('.shp', LABEL_FILE_EXT)):
                labelFiles.append(p)
                dfs.append(filter_df(load_df(p), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
            elif os.path.isdir(p):
                # Inspect the list entry itself (not the whole list) and keep what earlier entries contributed
                foundFiles, foundDfs = _load_label_dfs_from_dir(p, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
                labelFiles.extend(foundFiles)
                dfs.extend(foundDfs)
            else:
                print(f"Unsupported input type for {p}. Extension must be {LABEL_FILE_EXT}.")

        if not dfs:
            raise ValueError(f"No label files could be loaded from: {path}")
        if len(dfs) > 1:
            print('Joining Dataframes: ' + str(labelFiles))
        return pd.concat(dfs, ignore_index=True)

    raise ValueError(f"Unsupported input type for {path}. Must be a string or a list of strings.")
203
def make_lookup(labelLookupPath:str, compiledDf:gpd.GeoDataFrame, labelColumn:str):
    """Creates a lookup dictionary that maps integer values to the unique labels in the label column and saves it as JSON.

    Integers are assigned in order of first appearance of each label. If you wish to have specific labels mapped to
    specific integer values, it is best to make this lookup dictionary manually. Note that JSON stores the integer
    keys as strings, so a lookup read back with json.load has keys '0', '1', ...

    Args:
        labelLookupPath (str): The path the lookup dictionary is written to. An existing file at this path is overwritten.
        compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. This DataFrame should have a column with labels that you want to map to integer values.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column
    """
    # tolist() converts NumPy scalars (e.g. int64 labels) to native Python values, which json can serialize
    uniqueLabels = compiledDf[labelColumn].unique().tolist()
    labelDict = dict(enumerate(uniqueLabels))
    with open(labelLookupPath, 'w') as f:
        json.dump(labelDict, f) #saves to filepath
218
def get_raster_as_grid(raster):
    """Loads in a digital elevation model as a numpy array and converts any no-data to np.nan

        Args:
            raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR
                an already loaded raster dataset

        Returns:
            rasterGrid (numpy.ndarray): The elevation data stored as a float grid with no-data cells set to np.nan
            dx (float): The x coordinate spacing of the grid
            dy (float): The y coordinate spacing of the grid

        Raises:
            ValueError: Input is neither a path to an existing file nor a gdal.Dataset
        """

    #If this is a raster dataset the caller owns it, so it is left open
    if isinstance(raster, gdal.Dataset):
        dataset = raster
        doClose = False

    #If this is a file, open it here and close it after the operation completes
    elif isinstance(raster, (str, os.PathLike)) and os.path.isfile(raster):
        dataset = gdal.Open(raster)
        doClose = True

    else:
        # The exception must actually be raised; otherwise the code below fails with an unrelated error
        raise ValueError('Specified raster is neither a path to a raster or a gdal.Dataset. Please specify a valid path.')

    rasterGrid = dataset.ReadAsArray().astype(float)
    NDV = dataset.GetRasterBand(1).GetNoDataValue()

    #Mask out NDVs as nan (GetNoDataValue returns None when the band defines no no-data value)
    if NDV is not None:
        rasterGrid[rasterGrid == NDV] = np.nan

    # Grab the basic header information (xUL, dx, rot1, yUL, rot2, dy)
    geotransform = dataset.GetGeoTransform()

    dx = geotransform[1]
    dy = geotransform[-1]

    if doClose:
        dataset = None #Dropping the last reference closes the raster

    return rasterGrid, dx, dy
266
def load_rasters(df:gpd.GeoDataFrame, dataColumn:str, roiWidth:int, dataType:str = 'dem'):
    """Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)

    A file that cannot be loaded does not abort the run: the error is printed and a roiWidth x roiWidth grid of NaN
    is stored in its place. The input df is left unchanged.

    Args:
        df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Used for the size of the NaN placeholder grid.
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.

    Returns:
        dfLoadedRasters (gpd.GeoDataFrame): A copy of df with a new column (named by X_COL_LABEL) containing the loaded raster data.

    Raises:
        ValueError: If dataType is not one of 'dem', 'image1' or 'image3'.
    """
    # Validate up front: raised inside the loop, this error would be swallowed by the per-file
    # handler and every row would silently become a NaN grid.
    if dataType not in ('dem', 'image1', 'image3'):
        raise ValueError("Incorrect data type specified. Options: 'dem', 'image1' or 'image3'")

    grids = []
    for path in df[dataColumn].tolist():
        try:
            if dataType == 'dem':
                grid = get_raster_as_grid(path)[0]
            else:
                # 'L' = single band greyscale, 'RGB' = three bands; the with-block closes the image file
                with Image.open(path) as img:
                    grid = np.asarray(img.convert('L' if dataType == 'image1' else 'RGB'))
        except Exception as e:
            print(f'Error processing {path}: {e}. A Nan grid will be used in its place. ')
            grid = np.full((roiWidth, roiWidth), np.nan)
        grids.append(grid)

    # Copy before adding the column so the caller's dataframe is not modified
    dfLoadedRasters = df.copy()
    dfLoadedRasters[X_COL_LABEL] = grids #place loaded grids into the X column with a label specified in global variables.

    return dfLoadedRasters
305
def rm_invalid_raster_rows(dfLoadedRasters, roiWidth:int, rasterCol:str=None, idx:int=None):
    """Removes any rows with nan, infinite, or negative values, a standard deviation of 0, or the wrong shape in a dataframe with a column of loaded raster arrays.

    Args:
        dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
        roiWidth (int): The width of the raster grids. Any rasters whose rows x columns are not roiWidth x roiWidth will be removed.
        A third (band) axis, as produced for three-band images, is allowed.
        rasterCol (str): The label for the column containing raster arrays. Defaults to X_COL_LABEL.
        idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger database, the index value
        can be specified to be included in printout. Defaults to None.
    Returns:
        dfNanRm (pd.DataFrame or gpd.GeoDataFrame): A copy of the input without the invalid rows and with a fresh 0..n-1 index.
    """
    if not rasterCol:
        rasterCol = X_COL_LABEL

    # Counters for each condition (a row can count towards several)
    countNan = 0
    countInf = 0
    countNegative = 0
    countZeroStdv = 0
    countInvalidShape = 0

    keepRow = []
    for dataArray in dfLoadedRasters[rasterCol]:
        containsNan = bool(np.any(np.isnan(dataArray)))
        containsInf = bool(np.any(np.isinf(dataArray)))
        containsNegative = bool(np.any(dataArray < 0))
        zeroStdv = bool(np.nanstd(dataArray) == 0)
        # Only rows x columns must match; (roiWidth, roiWidth, bands) grids from multi-band images are valid too
        shape = np.shape(dataArray)
        invalidShape = len(shape) not in (2, 3) or shape[:2] != (roiWidth, roiWidth)

        countNan += containsNan
        countInf += containsInf
        countNegative += containsNegative
        countZeroStdv += zeroStdv
        countInvalidShape += invalidShape

        keepRow.append(not (containsNan or containsInf or containsNegative or zeroStdv or invalidShape))

    keepMask = np.asarray(keepRow, dtype=bool)
    numDropped = int((~keepMask).sum())

    # Select the valid rows by position and reset the index
    dfNanRm = dfLoadedRasters[keepMask].reset_index(drop=True)

    if numDropped > 0:
        # 'is not None' so that idx 0 is still reported
        if idx is not None:
            print(f"The following rows were dropped for idx {idx} the respective reasons:")
        else:
            print("The following rows were dropped for the respective reasons:")
        if countNan > 0:
            print(f" - Containing NaN values: {countNan}")
        if countInf > 0:
            print(f" - Containing infinite values: {countInf}")
        if countNegative > 0:
            print(f" - Containing negative values: {countNegative}")
        if countZeroStdv > 0:
            print(f" - Having a standard deviation of 0: {countZeroStdv}")
        if countInvalidShape > 0:
            print(f" - Not having the shape ({roiWidth}, {roiWidth}): {countInvalidShape}")
    return dfNanRm
375
def make_even_distribution(df:gpd.GeoDataFrame, doPrint:bool=True):
    """Evens out the label distribution by randomly dropping rows of every label that has more rows than the rarest label.

    Labels are read from the Y_COL_LABEL column. Rows to drop are picked with np.random.choice (without replacement).

    Args:
        df (gpd.GeoDataFrame): The GeoDataFrame containing the data, including the Y_COL_LABEL column of labels to balance.
        doPrint (bool, optional): If True, prints how many rows are removed for each over-represented label. Defaults to True.

    Returns:
        evenlyDistributedDf (gpd.GeoDataFrame): A new GeoDataFrame in which every label has as many rows as the rarest label.
    """
    labelCounts = df[Y_COL_LABEL].value_counts()

    # Number of rows each label has beyond the rarest label; only labels with a surplus need trimming
    surplus = labelCounts - labelCounts.min()
    surplus = surplus[surplus > 0]

    for label, extra in surplus.items():
        numToRemove = int(extra)
        candidates = df.loc[df[Y_COL_LABEL] == label].index
        toDrop = np.random.choice(candidates, size=numToRemove, replace=False)
        df = df.drop(toDrop)

        if doPrint:
            print(f'Removing {numToRemove} labels with label {label} to achieve the desired distribution.')

    return df
404
405
def subset_df(df:gpd.GeoDataFrame, subsetSize:int, evenlyDistributeDf:bool = False, doPrint:bool = True):
    """This function creates a subset of the input GeoDataFrame, optionally ensuring that the distribution of labels is even before subsetting.

    The subset keeps the label proportions of the dataframe that is sampled (the evened dataframe when
    evenlyDistributeDf is True). Per-label sample counts are rounded down, so the result can be slightly smaller
    than subsetSize. A label never contributes more rows than it has, so asking for more rows than are available
    returns all of them instead of failing.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. This DataFrame should have a column with labels that you want to subset.
        subsetSize (int): The desired size of the subset.
        evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
        doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.

    Returns:
        gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.
    """
    if evenlyDistributeDf:
        dfToSubset = make_even_distribution(df, doPrint=False)
    else:
        dfToSubset = df.copy()

    # Measure the distribution on the dataframe that is actually sampled, so an evened dataframe yields an even subset
    labelCounts = dfToSubset[Y_COL_LABEL].value_counts().sort_index()
    currentDistribution = labelCounts / labelCounts.sum()

    # Round down, and never ask for more rows than a label has (np.random.choice would raise)
    samplesToKeep = (currentDistribution * subsetSize).astype(int).clip(upper=labelCounts)
    totalToKeep = int(samplesToKeep.sum())

    indicesToKeep = []
    # Labels are visited in order of first appearance in the input
    for label in df[Y_COL_LABEL].dropna().unique():
        labelIndices = dfToSubset[dfToSubset[Y_COL_LABEL] == label].index.tolist()
        subset = np.random.choice(labelIndices, int(samplesToKeep[label]), replace = False)
        indicesToKeep.extend(subset)
        if doPrint:
            print(f'Keeping {len(subset)} of {len(labelIndices)} for label: {label} to obtain a dataframe of size: {totalToKeep}')
            print(f'Distribution for label {label}: {round(currentDistribution[label], 2)}')

    subsetDf = dfToSubset.loc[indicesToKeep]

    return subsetDf
443
def process_grids(df:gpd.GeoDataFrame, scalingMethod:str='min_max', doApplyGaussianFilter:bool = False, sigma:float=None):
    """Iterates through the rows of a df and processes the loaded grids to specifications. This function can apply a gaussian filter
    to each grid and then normalizes/rescales it with the specified method (min-max or z-score).

    Args:
        df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids (in the X_COL_LABEL column) to process.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'min_max'.
        - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
        - 'z_score': Z-score scaling method. Will transform grids to have a mean of 0 and a standard deviation of 1.
        Any other value (including None) leaves the grids unscaled and issues a warning.
        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter will be applied to each grid before scaling. Defaults to False.
        sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified but gaussian filter argument set to true will default to 1.

    Returns:
        processedDf (gpd.GeoDataFrame): A copy of df whose grids have been filtered and rescaled as requested.
    """
    import warnings  # local import: only needed for the unscaled-grid notice below

    processedDf = df.copy()

    # Resolve the filter default once instead of on every row
    if doApplyGaussianFilter and sigma is None:
        print("No sigma specified for gaussian filter. Will default to sigma = 1")
        sigma = 1

    # Warn and continue unscaled, as the message states, rather than aborting by raising the Warning
    if not scalingMethod:
        warnings.warn("No scaling method specified. Grids will not be scaled or normalized.", stacklevel=2)
    elif scalingMethod not in ('min_max', 'z_score'):
        warnings.warn(f"Unrecognized scaling method '{scalingMethod}'. Grids will not be scaled or normalized.", stacklevel=2)

    # Read from the untouched input while writing into the copy
    for i, grid in df[X_COL_LABEL].items():

        #Apply a gaussian filter to slightly blur/smooth dem
        if doApplyGaussianFilter:
            gridToRescale = gaussian_filter(grid, sigma = sigma, mode = 'nearest')
        else:
            gridToRescale = grid

        #Rescales DEMS by either min max or zscore
        if scalingMethod == 'min_max':
            minVal = np.nanmin(gridToRescale)
            maxVal = np.nanmax(gridToRescale)
            normGrid = (gridToRescale-minVal)/(maxVal-minVal)
        elif scalingMethod == 'z_score':
            mean_value = np.nanmean(gridToRescale)
            std_deviation = np.nanstd(gridToRescale)
            normGrid = (gridToRescale - mean_value) / std_deviation
        else:
            normGrid = gridToRescale

        processedDf.at[i, X_COL_LABEL] = normGrid

    return processedDf
493
def shuffle_df(df:gpd.GeoDataFrame):
    """Returns the rows of a DataFrame in random order.

    The order is drawn with np.random.permutation and the result gets a fresh 0..n-1 index.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.

    Returns:
        shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows.
    """
    rowCount = len(df)
    newOrder = np.random.permutation(rowCount)  # random ordering of the row positions
    reordered = df.iloc[newOrder]
    return reordered.reset_index(drop=True)
506
def make_arrays(df:gpd.GeoDataFrame, dataType:str = 'dem', dataCol:str = None, labelCol:str = None):
    """Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
        dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
        dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if available.
        labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if available.

    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands). None if no data column is available.
            - labelArray (np.ndarray or None): The array of integer labels, with shape (num_samples, 1). None if no label column is available.
        Both entries are None when df is empty.

    Raises:
        ValueError: If a data column is used and dataType is not 'dem', 'image1' or 'image3'.
    """

    if df.empty:
        return None, None

    if not dataCol and X_COL_LABEL in df.columns:
        dataCol = X_COL_LABEL
    if not labelCol and Y_COL_LABEL in df.columns:
        labelCol = Y_COL_LABEL
    # Initialize arrays to None
    dataArray = None
    labelArray = None

    if dataCol:
        # All grids are assumed to share the rows x columns of the first one
        firstGrid = df[dataCol].iloc[0]
        nrows = firstGrid.shape[0]
        ncols = firstGrid.shape[1]

        if dataType == 'image3':
            nbands = 3
        # Membership test: "dataType == 'image1' or 'dem'" is always true and hid invalid values
        elif dataType in ('image1', 'dem'):
            nbands = 1
        else:
            raise ValueError("Incorrect datatype specified for dataType. Must be either 'dem', 'image3' or 'image1'.")

        dataArray = np.zeros((len(df), nrows, ncols, nbands))

        for i, grid in enumerate(df[dataCol]):
            dataArray[i,:,:,:] = grid.reshape(nrows, ncols, nbands)
    if labelCol:
        labelArray = df[labelCol].values.reshape((len(df), 1)).astype(int)

    return dataArray, labelArray
553
554def preprocess_df(path:str, dataColumn:str, roiWidth:int, labelColumn:str, labelLookupPath:str, label:str, saveDirectory:str, 
555                  dataType:str = 'dem', labelsToRemove = None, scalingMethod:str = 'z_score', 
556                  doApplyGaussianFilter:bool = True, sigma:float = 0.5,
557                  evenlyDistributeDf:bool = False, subset:int = None
558                  ):
559    """Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.
560
561    Required Args:
562        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
563        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
564        roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
565        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
566        labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one 
567        does not currently exist. 
568        label (str): The label used to uniquely identify the output files.
569        saveDirectory (str): The path to the desired output directory.
570    Optional Args:
571        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
572        labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None. 
573        scalingMethod (str, optional): The normalization and rescaling method.
574        - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
575        - 'z_score': Z-score scaling method. Will transform grids into a standard normal distribution with a mean of 0 and a standard deviation of 1
576        (Default: 'z_score_scaling')
577        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter should be applied to grids. This is essentially a bluring/averaging filter that can help 
578        remove irregularities/artificats in data. Defaults to True.
579        sigma (float, optional): The sigma value used for the gaussian filter. Defaults to 0.5.
580        evenlyDistributeDf (bool, optional): When true, will remove some columns from the dataframe to create an even distribution of labels. Defaults to False.
581        subset (int, optional): When a number is specified, columns will be removed randomly to produce a dataframe of this size. 
582    Returns:
583        tuple: A tuple containing:
584            - dataArray (np.ndarray): The processed data array.
585            - labelArray (np.ndarray): The processed label array.
586            - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.
587    """
588    savebaseName = os.path.join(saveDirectory, '{}_{{}}.npy'.format(label))
589   
590    dataArrayPath = savebaseName.format('data')
591    labelArrayPath = savebaseName.format('labels')
592
593    dfOutPath = os.path.join(saveDirectory, f'{label}_df.shp')
594
595    overwrite = True
596
597    if os.path.exists(dataArrayPath) and os.path.exists(labelArrayPath) and os.path.exists(dfOutPath):
598        qStr = "Existing files for data and label arrays detected. Proceed with preprocessing and overwrite existing files (y/n)?"
599        overwrite = lt._request_yn_input(qStr)
600
601    if overwrite:
602        print(f'Importing, filtering, and joining into one dataframe...')
603        compiledDf = import_filter_concat_df(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
604        
605        #connect to dictionary with labels and int vals or make one
606        if not os.path.exists(labelLookupPath):
607            qstr = "No label lookup exists at this filepath. Would you like to make one (y/n)."
608            makeLookup = lt._request_yn_input(qstr)
609            if makeLookup:
610                make_lookup(labelLookupPath, compiledDf, labelColumn)
611            else:
612                raise ValueError('No label lookup exists at filepath. Please make a lookup! :~)')
613        with open(labelLookupPath, 'r') as f:
614            labelDict = json.load(f)
615
616        #Make a column for int labels
617        labelDictRev = {labelDict[key]:key for key in labelDict}
618        compiledDf[Y_COL_LABEL] = compiledDf[labelColumn].apply(lambda val: labelDictRev[val])
619
620        if subset:
621            print(f"Subsetting the dataframe to subset size: {subset}")
622            dfToLoad = subset_df(compiledDf, subset, evenlyDistributeDf=evenlyDistributeDf, doPrint=False)
623        else:
624            dfToLoad = compiledDf
625
626        #load rasters and rescale
627        print('Loading rasters...')
628        dfLoadedRasters = load_rasters(dfToLoad, dataColumn, roiWidth, dataType= dataType)
629        
630        print('Removing entries with Nan vals...')
631        # Handle nan vals in arrays
632        dfCleaned = rm_invalid_raster_rows(dfLoadedRasters, roiWidth)
633
634        #option to remove some labels to make an even distribution
635        if evenlyDistributeDf:
636            
637            distributedDf = make_even_distribution(dfCleaned)
638        else:
639            distributedDf = dfCleaned
640
641        if subset:
642            subsetDf = subset_df(distributedDf, subset)
643        else:
644            subsetDf = distributedDf
645
646        if not scalingMethod and not doApplyGaussianFilter:
647            processedDf = subsetDf
648        else:
649            print(f'Processing grids with:\nscaling method: {scalingMethod}\ngaussian filter: {doApplyGaussianFilter}, sigma: {sigma}')
650            processedDf = process_grids(subsetDf, scalingMethod=scalingMethod, doApplyGaussianFilter=doApplyGaussianFilter, sigma=sigma)
651
652        #Shuffles Df
653        shuffledDf = shuffle_df(processedDf)
654
655        print('Converting data and labels to arrays...')
656        dataArray, labelArray = make_arrays(shuffledDf, dataType = dataType)
657        
658        if dataArray is not None and labelArray is not None:
659            #Save Data and Label Arrays
660            np.save(dataArrayPath, dataArray)
661            np.save(labelArrayPath, labelArray)
662
663            outDf = shuffledDf.drop(columns=[X_COL_LABEL])
664            
665            outDf.to_file(dfOutPath)
666       
667            #Print Paths
668            print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')
669        else:
670            print("All df rows removed during preprocessing due to invalid raster data. No arrays or df's saved.")
671            return None, None, None
672    
673    else:
674        #Load df from existing file
675        print(f'Loading from preexisting files.\nData Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDf Path: {dfOutPath}')
676        dataArray = np.load(dataArrayPath, allow_pickle=True)
677        labelArray = np.load(labelArrayPath, allow_pickle=True)
678
679        outDf = gpd.read_file(dfOutPath)
680
681    return dataArray, labelArray, outDf
682
683
684
if __name__ == '__main__':
    import json
    import sys

    # Script entry point: the single command-line argument is the path to a JSON file
    # whose keys are passed to preprocess_df as keyword arguments.
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <path_to_preprocess_params.json>')

    paramsPath = sys.argv[1]
    with open(paramsPath, 'r') as f:
        # Kept in its own name so the module-level params_dict (global variables) is not overwritten.
        preprocessKwargs = json.load(f)

    dataArray, labelArray, outDf = preprocess_df(**preprocessKwargs)
695    
# Values of the shared constants that the module reads from configs/global_variables.json.
# Sub-folder (beside the label file) that load_df searches when recovering clipped raster paths.
CLIPPED_ROI_FOLDER = 'clippedRois'
# Sub-folder (beside the label file) that load_df searches when recovering image paths.
IMG_FOLDER = 'imgs'
# Prefix identifying the raster-path columns of a label dataframe.
ROIDF_RSTR_COL_PATTERN = 'rstr_clp0'
# Exact name of the image-path column of a label dataframe.
ROIDF_IMG_COL_PATTERN = 'imgPaths'
# Substring a file name must contain to be treated as a label file.
LABEL_FILE_PATTERN = '_labels'
# Extension (compared against the lower-cased file name) a label file must have.
LABEL_FILE_EXT = '.shp'
# Name of the dataframe column holding the integer-encoded label.
Y_COL_LABEL = 'intLabel'
# Name of the dataframe column holding the loaded raster arrays.
X_COL_LABEL = 'rasterData'
def load_df(filePath: str, checkFilePaths: bool = False):
 46def load_df(filePath:str, checkFilePaths:bool=False):
 47    """Loads df from filepaths. Checks df to see if filepaths within the df exist and attempts recovery within the database if not.
 48
 49    Args:
 50        filePath (str): Path to geodatabase where clipped rasters and labels are stored.
 51        checkFilePaths (bool): Whether to check if filepaths within the df exist. Default to False.
 52    
 53    Returns:
 54        df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.
 55    """
 56    df = gpd.read_file(filePath, truncation=False)
 57    recoveryAttempted = False  # Flag to track whether recovery attempt has been made
 58    recoverySuccessful = False
 59        
 60    # Get raster and image column names
 61    rasterCols = [col for col in df.columns if col.startswith(ROIDF_RSTR_COL_PATTERN)]
 62    imageCols = [col for col in df.columns if col == ROIDF_IMG_COL_PATTERN]
 63    
 64
 65    if checkFilePaths:
 66        #The following loops through the rows and raster columns and checks if raster/image paths are valid. If not, will attempt to find filepaths within the
 67        #df directory following the directory structure of the lidar label builder.
 68        for index, row in df.iterrows():
 69            for col in rasterCols:
 70                path = row[col]
 71                if not os.path.exists(path):
 72                    if not recoveryAttempted:
 73                        print(f'Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
 74                        recoveryAttempted = True
 75                    directory = os.path.split(filePath)[0]
 76                    fname = os.path.split(path)[1]
 77                    tryPath = os.path.join(directory, CLIPPED_ROI_FOLDER, fname)
 78                    if os.path.exists(tryPath):
 79                        df.at[index, col] = tryPath
 80                        recoverySuccessful=True
 81                        
 82                    else:
 83                        raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")
 84            for col in imageCols:
 85                path = row[col]
 86                if not os.path.exists(path):
 87                    if not recoveryAttempted:
 88                        print(f'Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
 89                        recoveryAttempted = True
 90                    directory = os.path.split(filePath)[0]
 91                    fname = os.path.split(path)[1]
 92                    tryPath = os.path.join(directory, IMG_FOLDER, fname)
 93                    if os.path.exists(tryPath):
 94                        df.at[index, col] = tryPath
 95                        recoverySuccessful = True
 96                        
 97                    else:
 98                        raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")
 99        
100        #if files were recovered saves a new df file with the recovered raster/image filepaths
101        if recoverySuccessful:
102            basePath, ext = os.path.splitext(filePath)
103            savePath = f'{basePath}_recovered{ext}'
104            df.to_file(savePath)
105    
106    return df

Loads df from filepaths. Checks df to see if filepaths within the df exist and attempts recovery within the database if not.

Arguments:
  • filePath (str): Path to geodatabase where clipped rasters and labels are stored.
  • checkFilePaths (bool): Whether to check if filepaths within the df exist. Default to False.
Returns:

df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.

def filter_df( df: geopandas.geodataframe.GeoDataFrame, dataColumn: str, labelColumn: str, labelsToRemove=None):
def filter_df(df:gpd.GeoDataFrame, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """Reduce a GeoDataFrame to its data, label and geometry columns and drop unwanted rows.

    Rows without a label are always removed. Rows whose label appears in labelsToRemove are removed as well.

    Args:
        df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (geopandas.GeoDataFrame): Filtered GeoDataFrame with only the data, label and geometry columns
        and without the unlabeled / excluded rows.
    """
    # Keep only the columns of interest, then discard rows that have no label
    labeledDf = df[[dataColumn, labelColumn, 'geometry']].dropna(subset=[labelColumn])

    if not labelsToRemove:
        return labeledDf.copy()

    # A single label may be passed as a bare string
    unwantedLabels = [labelsToRemove] if isinstance(labelsToRemove, str) else labelsToRemove
    return labeledDf[~labeledDf[labelColumn].isin(unwantedLabels)]

Filter a GeoDataFrame by keeping only specified data and label columns and removing entries based on conditions.

Arguments:
  • df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
  • dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
  • labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
  • labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
Returns:

filteredDf (geopandas.GeoDataFrame): Filtered GeoDataFrame with only specified data and label columns, and entries removed based on specified conditions.

def import_filter_concat_df(path, dataColumn: str, labelColumn: str, labelsToRemove=None):
def _collect_label_dfs(directory, dataColumn, labelColumn, labelsToRemove=None):
    """Walk directory, load and filter every label file beneath it; return (list of dfs, list of file paths)."""
    dfs = []
    labelFiles = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if LABEL_FILE_PATTERN in file and file.lower().endswith(LABEL_FILE_EXT):
                filePath = os.path.join(root, file)
                labelFiles.append(filePath)
                #drop any labels that are na or that match a label in labelsToRemove
                dfs.append(filter_df(load_df(filePath), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
    return dfs, labelFiles


def import_filter_concat_df(path, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """
    Import and filter GeoDataFrames from shapefiles or directories of shapefiles. If multiple shapefiles are found they are concatenated into one single filtered df.

    Args:
        path (str or list): File path to a label shapefile, a directory that is searched recursively for label shapefiles, or a list whose
        items are either of those. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels, and geometry, with any rows
        whose label matches a value in the labelsToRemove argument removed from the dataframe.

    Raises:
        ValueError: If the input 'path' is of an unsupported type, if the file type or directory structure is not valid,
        or if no label file could be loaded from the given path(s).
    """
    #if the input path is a string (a path to one location)
    if isinstance(path, str):
        #A single file that matches the label file convention specified in global variables
        if os.path.isfile(path) and LABEL_FILE_PATTERN in path and path.lower().endswith(LABEL_FILE_EXT):
            return filter_df(load_df(path), dataColumn, labelColumn, labelsToRemove=labelsToRemove)

        #A directory: load every file matching the label file pattern and concatenate into one df
        if os.path.isdir(path):
            dfs, labelFiles = _collect_label_dfs(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
            if not dfs:
                raise ValueError(f"No label files found in directory: {path}")
            if len(dfs) == 1:
                return dfs[0] #just one df in the directory, nothing to join
            print('Joining Dataframes: ' + str(labelFiles))
            return pd.concat(dfs, ignore_index=True)

        raise ValueError(f"Unsupported file type or directory structure for path: {path}")

    #If given a list of paths, load and filter each item and join everything into one df
    if isinstance(path, list):
        dfs = []
        for p in path:
            if os.path.isfile(p) and p.lower().endswith(LABEL_FILE_EXT):
                dfs.append(filter_df(load_df(p), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
            elif os.path.isdir(p):
                # Frames found in this directory are added to (not substituted for) those already collected
                dirDfs, labelFiles = _collect_label_dfs(p, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
                if labelFiles:
                    print('Joining Dataframes: ' + str(labelFiles))
                dfs.extend(dirDfs)
            else:
                print(f"Unsupported input type for {p}. Extension must be {LABEL_FILE_EXT}.")

        if not dfs:
            raise ValueError(f"No label files could be loaded from: {path}")
        return pd.concat(dfs, ignore_index=True)

    raise ValueError(f"Unsupported input type for {path}. Must be a string or a list of strings.")

Import and filter GeoDataFrames from shapefiles or directories of shapefiles. If multiple shapefiles will concatenate into one single filtered df.

Arguments:
  • path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
  • dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
  • labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
  • labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
Returns:

filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels, and geometry, with any rows containing labels matching a value in the labelsToRemove argument removed from the dataframe.

Raises:
  • ValueError: If the input 'path' is of an unsupported type or if the file type or directory structure is not valid.
def make_lookup( labelLookupPath: str, compiledDf: geopandas.geodataframe.GeoDataFrame, labelColumn: str):
def make_lookup(labelLookupPath:str, compiledDf:"gpd.GeoDataFrame", labelColumn:str):
    """Creates a lookup dictionary that maps integer values to the unique labels in the label column and saves it as JSON.

    Note: integers are assigned in order of first appearance of each label. If you wish to have specific labels mapped to
    specific integer values, it is best to make this lookup dictionary manually. Because JSON object keys are always strings,
    the integer keys are stored as '0', '1', ... in the written file.

    Args:
        labelLookupPath (str): The path the lookup dictionary is written to. An existing file at this path is overwritten.
        compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. This DataFrame should have a column with labels that you want to map to integer values.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column
    """
    # .tolist() turns numpy scalars (e.g. np.int64 labels) into plain Python values, which json can serialize
    uniqueLabels = compiledDf[labelColumn].unique().tolist()
    labelDict = dict(enumerate(uniqueLabels))
    with open(labelLookupPath, 'w') as f:
        json.dump(labelDict, f) #saves to filepath

Creates a lookup dictionary that maps integer values to unique string labels in the label column. Note: this is done in order of appearance for the label. If you wish to have specific labels mapped to specific integer values, it is best to make this lookup dictionary manually.

Arguments:
  • labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one does not currently exist.
  • compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. This DataFrame should have a column with labels that you want to map to integer values.
  • labelColumn (str): A string matching the column name for the label column containing labels associated with your data column
def get_raster_as_grid(raster):
def get_raster_as_grid(raster):
    """Loads in a digital elevation model as a numpy array and converts any no-data to np.nan

    Args:
        raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR
            an already loaded raster dataset

    Returns:
        rasterGrid (numpy.ndarray): The elevation data stored as a grid of floats
        dx (float): The x coordinate spacing of the grid
        dy (float): The y coordinate spacing of the grid

    Raises:
        ValueError: Input is neither a path to an existing raster file nor a gdal.Dataset
    """
    if isinstance(raster, gdal.Dataset):
        #Already opened by the caller, who remains responsible for it
        dataset = raster
        openedHere = False
    elif isinstance(raster, (str, os.PathLike)) and os.path.isfile(raster):
        dataset = gdal.Open(os.fspath(raster))
        openedHere = True
    else:
        raise ValueError('Specified raster is neither a path to a raster or a gdal.Dataset. Please specify a valid path.')

    rasterGrid = dataset.ReadAsArray().astype(float)
    NDV = dataset.GetRasterBand(1).GetNoDataValue()

    #Mask out NDVs as nan (a band may have no no-data value defined)
    if NDV is not None:
        rasterGrid[rasterGrid == NDV] = np.nan

    # Grab the basic header information (xUL, dx, rot1, yUL, rot2, dy)
    geotransform = dataset.GetGeoTransform()
    dx = geotransform[1]
    dy = geotransform[-1]

    if openedHere:
        dataset = None #Drop our reference to the dataset opened in this function

    return rasterGrid, dx, dy

Loads in a digital elevation model as a numpy array and converts any no-data to np.nan

Arguments:
  • raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR an already loaded raster dataset
Returns:

rasterGrid (numpy.ndarray): The elevation data stored as a grid dx (float): The x coordinate spacing of the grid dy (float): The y coordinate spacing of the grid

Raises:
  • Exception: Input is neither a path to a raster or a gdal.Dataset
def load_rasters( df: geopandas.geodataframe.GeoDataFrame, dataColumn: str, roiWidth: int, dataType: str = 'dem'):
def load_rasters(df:"gpd.GeoDataFrame", dataColumn:str, roiWidth:int, dataType:str = 'dem'):
    """Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)

    A file that cannot be read does not abort the run: a message is printed and a (roiWidth, roiWidth) grid of NaN
    is stored in its place. The input dataframe is left unmodified.

    Args:
        df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Used for the size of the NaN placeholder grid.
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.

    Returns:
        dfLoadedRasters (gpd.GeoDataFrame): A copy of df with a new column (X_COL_LABEL) containing the loaded raster data.

    Raises:
        ValueError: If dataType is not one of 'dem', 'image1' or 'image3'.
    """
    # Validate up front: raised inside the per-file try block this error would be swallowed
    # and every row would silently receive a NaN grid.
    if dataType not in ('dem', 'image1', 'image3'):
        raise ValueError(f"Incorrect data type specified: {dataType!r}. Options: 'dem', 'image1' or 'image3'")

    grids = []
    for path in df[dataColumn].tolist():
        try:
            if dataType == 'dem':
                grid = get_raster_as_grid(path)[0]
            elif dataType == 'image1':
                grid = np.asarray(Image.open(path).convert('L')) #single band greyscale
            else:
                grid = np.asarray(Image.open(path).convert('RGB')) #three band colour
        except Exception as e:
            print(f'Error processing {path}: {e}. A Nan grid will be used in its place. ')
            grid = np.full((roiWidth, roiWidth), np.nan)
        grids.append(grid)

    # Copy before adding the column so the caller's dataframe is not mutated
    dfLoadedRasters = df.copy()
    dfLoadedRasters[X_COL_LABEL] = grids #place loaded grids into the X column with a label specified in global variables.

    return dfLoadedRasters

Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)

Arguments:
  • df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
  • dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
  • roiWidth (int): The width of the raster grids.
  • dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
Returns:

dfLoadedRasters (gpd.GeoDataFrame): A geopandas GeoDataFrame with a new column containing the loaded raster data.

def rm_invalid_raster_rows( dfLoadedRasters, roiWidth: int, rasterCol: str = None, idx: int = None):
def rm_invalid_raster_rows(dfLoadedRasters, roiWidth:int, rasterCol:str=None, idx:int=None):
    """Removes any rows with nan, infinite, or negative values, a standard deviation of 0, or the wrong grid size in a dataframe with a column of loaded raster arrays.

    Args:
        dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
        roiWidth (int): The width of the raster grids. Any raster whose first two dimensions are not roiWidth x roiWidth will be removed
        (a trailing band axis, e.g. roiWidth x roiWidth x 3 for RGB images, is allowed).
        rasterCol (str): The label for the column containing raster arrays. Defaults to X_COL_LABEL when not given.
        idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger database, the index value
        can be specified to be included in printout. Defaults to None.
    Returns:
        dfNanRm (pd.DataFrame or gpd.GeoDataFrame): Copy of the input without the invalid rows and with a fresh 0..n-1 index.
    """
    if not rasterCol:
        rasterCol = X_COL_LABEL

    # Initialize counters for each condition (one row can count towards several)
    countNan = 0
    countInf = 0
    countNegative = 0
    countZeroStdv = 0
    countInvalidShape = 0

    rowsToDrop = []
    for i, row in dfLoadedRasters.iterrows():
        dataArray = row[rasterCol]
        containsNan = np.any(np.isnan(dataArray))
        containsInf = np.any(np.isinf(dataArray))
        containsNegative = np.any(dataArray < 0)
        zeroStdv = np.nanstd(dataArray) == 0
        # Only the two leading (spatial) dimensions are compared so multi-band image grids are not rejected
        invalidShape = dataArray.shape[:2] != (roiWidth, roiWidth)

        countNan += int(containsNan)
        countInf += int(containsInf)
        countNegative += int(containsNegative)
        countZeroStdv += int(zeroStdv)
        countInvalidShape += int(invalidShape)

        if containsNan or containsInf or containsNegative or zeroStdv or invalidShape:
            rowsToDrop.append(i)

    #Drop invalid rows and reset index after removing rows
    dfNanRm = dfLoadedRasters.drop(rowsToDrop).reset_index(drop=True)

    if len(rowsToDrop) > 0:
        # 'is not None' so that an index of 0 is still reported
        if idx is not None:
            print(f"The following rows were dropped for idx {idx} the respective reasons:")
        else:
            print("The following rows were dropped for the respective reasons:")
        if countNan > 0:
            print(f" - Containing NaN values: {countNan}")
        if countInf > 0:
            print(f" - Containing infinite values: {countInf}")
        if countNegative > 0:
            print(f" - Containing negative values: {countNegative}")
        if countZeroStdv > 0:
            print(f" - Having a standard deviation of 0: {countZeroStdv}")
        if countInvalidShape > 0:
            print(f" - Not having the shape ({roiWidth}, {roiWidth}): {countInvalidShape}")
    return dfNanRm

Removes any rows with nan, infinite, or negative values or having a standard deviation of 0 in a dataframe with a column of loaded raster arrays.

Arguments:
  • dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
  • roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
  • rasterCol (str): The label for the column containing raster arrays
  • idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger database, the index value can be specified to be included in printout. Defaults to None.
Returns:

dfNanRm (pd.DataFrame or gpd.GeoDataFrame): Updated DataFrame after handling NaN values.

def make_even_distribution(df: geopandas.geodataframe.GeoDataFrame, doPrint: bool = True):
def make_even_distribution(df:gpd.GeoDataFrame, doPrint:bool=True):
    """Balance the label distribution of a GeoDataFrame by randomly discarding rows of over-represented labels.

    Every label is reduced to the row count of the least frequent label; which rows are discarded is chosen at random.

    Args:
        df (gpd.GeoDataFrame): The GeoDataFrame containing the data. It must contain the integer label column (Y_COL_LABEL).
        doPrint (bool, optional): If True, prints how many rows are removed for each label. Defaults to True.

    Returns:
        gpd.GeoDataFrame: A new GeoDataFrame in which every label occurs equally often.
    """
    labelCounts = df[Y_COL_LABEL].value_counts()

    # Number of surplus rows per label, relative to the rarest label
    surplus = labelCounts - labelCounts.min()
    surplus = surplus[surplus > 0]

    for label, extra in surplus.items():
        numToRemove = int(extra)
        candidates = df[df[Y_COL_LABEL] == label].index
        df = df.drop(np.random.choice(candidates, size=numToRemove, replace=False))

        if doPrint:
            print(f'Removing {numToRemove} labels with label {label} to achieve the desired distribution.')

    return df

Adjusts the distribution of labels in a GeoDataFrame to be even by removing rows from any label that doesn't have the smallest number of labels.

Arguments:
  • df (gpd.GeoDataFrame): The GeoDataFrame containing the data. This DataFrame should have a column with labels that you want to distribute evenly.
  • doPrint (bool, optional): If True, prints information about the labels being removed to achieve even distribution. Defaults to True.
Returns:

evenlyDistributedDf (gpd.GeoDataFrame): A new GeoDataFrame with an adjusted label distribution.

def subset_df( df: geopandas.geodataframe.GeoDataFrame, subsetSize: int, evenlyDistributeDf: bool = False, doPrint: bool = True):
def subset_df(df:gpd.GeoDataFrame, subsetSize:int, evenlyDistributeDf:bool = False, doPrint:bool = True):
    """This function creates a subset of the input GeoDataFrame, optionally ensuring that the distribution of labels is even before subsetting.

    The number of rows kept per label is proportional to that label's share of the dataframe being subset
    (after the optional evening-out step) and never exceeds the number of rows available for the label. Because
    the per-label counts are rounded down, the result may contain slightly fewer than subsetSize rows; if
    subsetSize is larger than the dataframe, all rows are kept.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. It must contain the integer label column (Y_COL_LABEL).
        subsetSize (int): The desired size of the subset.
        evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
        doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.

    Returns:
        gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.
    """
    uniqueLabs = df[Y_COL_LABEL].unique()

    if evenlyDistributeDf:
        dfToSubset = make_even_distribution(df, doPrint=False)
    else:
        dfToSubset = df.copy()

    # The distribution is measured on the dataframe that is actually sampled from, so that an
    # evened-out dataframe yields an even subset.
    availableCounts = dfToSubset[Y_COL_LABEL].value_counts().sort_index()
    currentDistribution = dfToSubset[Y_COL_LABEL].value_counts(normalize=True).sort_index()

    # Never request more rows than exist: sampling without replacement would raise otherwise
    samplesToKeep = np.minimum((currentDistribution*subsetSize).astype(int), availableCounts)
    indicesToKeep = []

    for label in uniqueLabs:
        labelIndices = dfToSubset[dfToSubset[Y_COL_LABEL] == label].index.tolist()
        subset = np.random.choice(labelIndices, int(samplesToKeep[label]), replace = False)
        indicesToKeep.extend(subset)
        if doPrint:
            print(f'Keeping {len(subset)} of {len(labelIndices)} for label: {label} to obtain a dataframe of size: {sum(samplesToKeep)}')
            print(f'Distribution for label {label}: {round(currentDistribution[label], 2)}')

    subsetDf = dfToSubset.loc[indicesToKeep]

    return subsetDf

This function creates a subset of the input GeoDataFrame, optionally ensuring that the distribution of labels is even before subsetting.

Arguments:
  • df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. This DataFrame should have a column with labels that you want to subset.
  • subsetSize (int): The desired size of the subset.
  • evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
  • doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.
Returns:

gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.

def process_grids( df: geopandas.geodataframe.GeoDataFrame, scalingMethod: str = 'min_max', doApplyGaussianFilter: bool = False, sigma: float = None):
def process_grids(df:gpd.GeoDataFrame, scalingMethod:str='min_max', doApplyGaussianFilter:bool = False, sigma:float=None):
    """Filters and rescales every raster grid stored in the X_COL_LABEL column of a dataframe.

    Each grid is optionally smoothed with a gaussian filter and then rescaled with the requested
    method. The input dataframe is left untouched; a processed copy is returned. No augmentation
    (flipping/rotating) is performed by this function.

    Args:
        df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids to process.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'min_max'.
        - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
        - 'z_score': Z-score scaling method. Will result in a grid with a mean of 0 and a standard deviation of 1.
        - None (or an empty string): Grids are not rescaled (they are still filtered if requested).
        A constant grid (zero range or zero standard deviation) is returned as zeros instead of NaN/inf.
        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter will be applied to each grid. Defaults to False.
        sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified (or 0) but
        doApplyGaussianFilter is True, sigma defaults to 1.

    Raises:
        ValueError: If scalingMethod is given but is neither 'min_max' nor 'z_score'.

    Returns:
        processedDf (gpd.GeoDataFrame): A copy of df whose X_COL_LABEL grids have been filtered and rescaled.
    """
    # Validate once up front so a typo in the method name fails before any work is done.
    if scalingMethod and scalingMethod not in ('min_max', 'z_score'):
        raise ValueError(f"Unrecognized scaling method: {scalingMethod!r}. Must be 'min_max', 'z_score' or None.")
    if not scalingMethod:
        print("No scaling method specified. Grids will not be scaled or normalized.")

    # The sigma default does not depend on the row, so resolve it once outside the loop.
    if doApplyGaussianFilter and not sigma:
        print("No sigma specified for gaussian filter. Will default to sigma = 1")
        sigma = 1

    processedDf = df.copy()

    # Read grids from the input and write results to the copy so the iteration source never changes.
    for i, grid in df[X_COL_LABEL].items():

        #Apply a gaussian filter to slightly blur/smooth dem
        if doApplyGaussianFilter:
            gridToRescale = gaussian_filter(grid, sigma = sigma, mode = 'nearest')
        else:
            gridToRescale = grid

        #Rescales DEMs by either min-max or z-score
        if scalingMethod == 'min_max':
            minVal = np.nanmin(gridToRescale)
            valRange = np.nanmax(gridToRescale) - minVal
            centered = gridToRescale - minVal
            if valRange != 0:
                normGrid = centered / valRange
            else:
                # Constant grid: avoid 0/0 and return zeros (NaN cells stay NaN).
                normGrid = np.asarray(centered, dtype=float)
        elif scalingMethod == 'z_score':
            meanVal = np.nanmean(gridToRescale)
            stdVal = np.nanstd(gridToRescale)
            centered = gridToRescale - meanVal
            if stdVal != 0:
                normGrid = centered / stdVal
            else:
                # Constant grid: avoid division by a zero standard deviation.
                normGrid = np.asarray(centered, dtype=float)
        else:
            normGrid = gridToRescale

        processedDf.at[i, X_COL_LABEL] = normGrid

    return processedDf

Iterates through the rows of a df and processes the DEM grids to specifications. This function will normalize and rescale the raster grids based on the specified method (min-max or z-score). It also has the option to apply a gaussian filter. Note: the augmentation of grids by flipping up and down and rotating +90 and -90 degrees that is mentioned in this description is not performed by the current implementation.

Arguments:
  • df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids to process.
  • scalingMethod (str, optional): The normalization and rescaling method.
  • - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
  • - 'z_score': Z-score scaling method. Will transform grids into a standard normal distribution with a mean of 0 and a standard deviation of 1
  • (Default: 'min_max')
  • doApplyGaussianFilter: Specifies if a gaussian filter will be applied to each grid. Defaults to False.
  • sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified but gaussian filter argument set to true will default to 1.
  • dataframe. Augmentation includes flipping each grid up and down and rotating each grid +90 and -90 degrees. The original row from of the input
  • is copied, corresponding geometry is removed and these augmented rows are joined with the input dataframe.
Returns:

processedDf (gpd.GeoDataFrame): The processed Df with normalized, rescaled, filtered, and augmented grids.

def shuffle_df(df: geopandas.geodataframe.GeoDataFrame):
def shuffle_df(df:gpd.GeoDataFrame):
    """Returns the rows of a GeoDataFrame in a random order.

    The order is drawn with np.random.permutation (numpy's global random state), and the index
    of the result is reset to 0..n-1 with the previous index discarded.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.

    Returns:
        shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows.
    """
    # Draw a random ordering of the positional row numbers 0..len(df)-1.
    rowOrder = np.random.permutation(len(df))
    reordered = df.iloc[rowOrder]
    return reordered.reset_index(drop=True)

Shuffles the rows of a DataFrame.

Arguments:
  • df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.
Returns:

shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows.

def make_arrays( df: geopandas.geodataframe.GeoDataFrame, dataType: str = 'dem', dataCol: str = None, labelCol: str = None):
def make_arrays(df:gpd.GeoDataFrame, dataType:str = 'dem', dataCol:str = None, labelCol:str = None):
    """Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
        dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
        dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if that column is present.
        labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if that column is present.

    Raises:
        ValueError: If a data column is being converted and dataType is not 'dem', 'image1' or 'image3'.

    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands).
              None if df is empty or no data column is available.
            - labelArray (np.ndarray or None): The array of integer labels, with shape (num_samples, 1).
              None if df is empty or no label column is available.
    """

    if df.empty:
        return None, None

    if not dataCol and X_COL_LABEL in df.columns:
        dataCol = X_COL_LABEL
    if not labelCol and Y_COL_LABEL in df.columns:
        labelCol = Y_COL_LABEL

    # Initialize arrays to None
    dataArray = None
    labelArray = None

    if dataCol:
        # Number of bands per supported data type. A lookup replaces the old
        # `dataType == 'image1' or 'dem'` test, which was always true and let invalid types through.
        bandsByType = {'dem': 1, 'image1': 1, 'image3': 3}
        if dataType not in bandsByType:
            raise ValueError("Incorrect datatype specified for dataType. Must be either 'dem', 'image3' or 'image1'.")
        nbands = bandsByType[dataType]

        grids = df[dataCol]
        # The first grid defines the expected dimensions for every sample.
        firstGrid = grids.iloc[0]
        nrows = firstGrid.shape[0]
        ncols = firstGrid.shape[1]

        dataArray = np.zeros((len(df), nrows, ncols, nbands))

        for i, grid in enumerate(grids):
            dataArray[i,:,:,:] = grid.reshape(nrows, ncols, nbands)

    if labelCol:
        labelArray = df[labelCol].values.reshape((len(df), 1)).astype(int)

    return dataArray, labelArray

Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.

Arguments:
  • df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
  • dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
  • dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if available.
  • labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if available.
Returns:

tuple: A tuple containing: - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands). - labelArray (np.ndarray or None): The array of labels, with shape (num_samples, 1).

def preprocess_df( path: str, dataColumn: str, roiWidth: int, labelColumn: str, labelLookupPath: str, label: str, saveDirectory: str, dataType: str = 'dem', labelsToRemove=None, scalingMethod: str = 'z_score', doApplyGaussianFilter: bool = True, sigma: float = 0.5, evenlyDistributeDf: bool = False, subset: int = None):
def preprocess_df(path:str, dataColumn:str, roiWidth:int, labelColumn:str, labelLookupPath:str, label:str, saveDirectory:str, 
                  dataType:str = 'dem', labelsToRemove = None, scalingMethod:str = 'z_score', 
                  doApplyGaussianFilter:bool = True, sigma:float = 0.5,
                  evenlyDistributeDf:bool = False, subset:int = None
                  ):
    """Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.

    If output files for this label already exist in saveDirectory, the user is asked whether to overwrite
    them; answering no loads and returns the existing files instead of reprocessing.

    Required Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one 
        does not currently exist. 
        label (str): The label used to uniquely identify the output files.
        saveDirectory (str): The path to the desired output directory. It is created if it does not exist.
    Optional Args:
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
        labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None. 
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'z_score'.
        - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
        - 'z_score': Z-score scaling method. Will result in a grid with a mean of 0 and a standard deviation of 1.
        If no scaling method is given and doApplyGaussianFilter is False, grid processing is skipped entirely.
        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter should be applied to grids. This is essentially a blurring/averaging filter that can help 
        remove irregularities/artifacts in data. Defaults to True.
        sigma (float, optional): The sigma value used for the gaussian filter. Defaults to 0.5.
        evenlyDistributeDf (bool, optional): When true, will remove some rows from the dataframe to create an even distribution of labels. Defaults to False.
        subset (int, optional): When a number is specified, rows will be removed randomly to produce a dataframe of (at most) this size. 
    Raises:
        ValueError: If no label lookup exists at labelLookupPath and the user declines to create one.
        KeyError: If the label column contains labels that are missing from the label lookup.
    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray): The processed data array.
            - labelArray (np.ndarray): The processed label array.
            - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.
        All three are None if every row was removed during preprocessing (nothing is saved in that case).
    """
    # Build the output paths directly so a label containing braces cannot break string formatting.
    dataArrayPath = os.path.join(saveDirectory, f'{label}_data.npy')
    labelArrayPath = os.path.join(saveDirectory, f'{label}_labels.npy')

    dfOutPath = os.path.join(saveDirectory, f'{label}_df.shp')

    overwrite = True

    if os.path.exists(dataArrayPath) and os.path.exists(labelArrayPath) and os.path.exists(dfOutPath):
        qStr = "Existing files for data and label arrays detected. Proceed with preprocessing and overwrite existing files (y/n)?"
        overwrite = lt._request_yn_input(qStr)

    if overwrite:
        print('Importing, filtering, and joining into one dataframe...')
        compiledDf = import_filter_concat_df(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
        
        #connect to dictionary with labels and int vals or make one
        if not os.path.exists(labelLookupPath):
            qstr = "No label lookup exists at this filepath. Would you like to make one (y/n)."
            makeLookup = lt._request_yn_input(qstr)
            if makeLookup:
                make_lookup(labelLookupPath, compiledDf, labelColumn)
            else:
                raise ValueError('No label lookup exists at filepath. Please make a lookup! :~)')
        with open(labelLookupPath, 'r') as f:
            labelDict = json.load(f)

        #Make a column for int labels
        # The lookup maps int keys -> string labels; invert it to map string labels -> int keys.
        labelDictRev = {labelDict[key]:key for key in labelDict}

        # Report every label missing from the lookup at once instead of failing on the first one.
        unknownLabels = sorted(set(compiledDf[labelColumn].unique()) - set(labelDictRev), key=str)
        if unknownLabels:
            raise KeyError(f'Labels missing from the label lookup at {labelLookupPath}: {unknownLabels}')
        compiledDf[Y_COL_LABEL] = compiledDf[labelColumn].apply(lambda val: labelDictRev[val])

        # Subset before loading so that only the rasters that will be used are read from disk.
        if subset:
            print(f"Subsetting the dataframe to subset size: {subset}")
            dfToLoad = subset_df(compiledDf, subset, evenlyDistributeDf=evenlyDistributeDf, doPrint=False)
        else:
            dfToLoad = compiledDf

        #load rasters and rescale
        print('Loading rasters...')
        dfLoadedRasters = load_rasters(dfToLoad, dataColumn, roiWidth, dataType= dataType)
        
        print('Removing entries with Nan vals...')
        # Handle nan vals in arrays
        dfCleaned = rm_invalid_raster_rows(dfLoadedRasters, roiWidth)

        #option to remove some labels to make an even distribution
        if evenlyDistributeDf:
            distributedDf = make_even_distribution(dfCleaned)
        else:
            distributedDf = dfCleaned

        # Only subset again when there are still more rows than requested. Once invalid rasters have
        # been dropped, asking for `subset` rows from a smaller dataframe would request more samples
        # per label than exist and fail inside the sampling step.
        if subset and len(distributedDf) > subset:
            subsetDf = subset_df(distributedDf, subset)
        else:
            subsetDf = distributedDf

        if not scalingMethod and not doApplyGaussianFilter:
            processedDf = subsetDf
        else:
            print(f'Processing grids with:\nscaling method: {scalingMethod}\ngaussian filter: {doApplyGaussianFilter}, sigma: {sigma}')
            processedDf = process_grids(subsetDf, scalingMethod=scalingMethod, doApplyGaussianFilter=doApplyGaussianFilter, sigma=sigma)

        #Shuffles Df
        shuffledDf = shuffle_df(processedDf)

        print('Converting data and labels to arrays...')
        dataArray, labelArray = make_arrays(shuffledDf, dataType = dataType)
        
        if dataArray is not None and labelArray is not None:
            # Make sure the output directory exists so the results of the processing are not lost at save time.
            if saveDirectory:
                os.makedirs(saveDirectory, exist_ok=True)

            #Save Data and Label Arrays
            np.save(dataArrayPath, dataArray)
            np.save(labelArrayPath, labelArray)

            # The grids live in the saved data array; drop them so the dataframe can be written as a shapefile.
            outDf = shuffledDf.drop(columns=[X_COL_LABEL])
            
            outDf.to_file(dfOutPath)
       
            #Print Paths
            print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')
        else:
            print("All df rows removed during preprocessing due to invalid raster data. No arrays or df's saved.")
            return None, None, None
    
    else:
        #Load df from existing file
        print(f'Loading from preexisting files.\nData Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDf Path: {dfOutPath}')
        # NOTE(review): allow_pickle=True can execute arbitrary code if these files come from an
        # untrusted source. The arrays written above are plain numeric arrays, so pickling should
        # not be required - confirm and consider allow_pickle=False.
        dataArray = np.load(dataArrayPath, allow_pickle=True)
        labelArray = np.load(labelArrayPath, allow_pickle=True)

        outDf = gpd.read_file(dfOutPath)

    return dataArray, labelArray, outDf

Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.

Required Args:

path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn. dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images. roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed. labelColumn (str): A string matching the column name for the label column containing labels associated with your data column. labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one does not currently exist. label (str): The label used to uniquely identify the output files. saveDirectory (str): The path to the desired output directory.

Optional Args:

dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'. labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None. scalingMethod (str, optional): The normalization and rescaling method.

  • 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
  • 'z_score': Z-score scaling method. Will transform grids into a standard normal distribution with a mean of 0 and a standard deviation of 1 (Default: 'z_score'). doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter should be applied to grids. This is essentially a blurring/averaging filter that can help remove irregularities/artifacts in data. Defaults to True. sigma (float, optional): The sigma value used for the gaussian filter. Defaults to 0.5. evenlyDistributeDf (bool, optional): When true, will remove some rows from the dataframe to create an even distribution of labels. Defaults to False. subset (int, optional): When a number is specified, rows will be removed randomly to produce a dataframe of this size.
Returns:

tuple: A tuple containing: - dataArray (np.ndarray): The processed data array. - labelArray (np.ndarray): The processed label array. - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.