lidar_labeler.preprocessing
A collection of functions that facilitate preprocessing a database built with the label builder for training a convolutional neural network algorithm. Run with llb venv.
"""
A collection of functions that facilitate preprocessing a database built with the label builder
for training a convolutional neural network algorithm.
Run with llb venv
"""
#import packages
import sys
import os
# scriptDir = os.path.dirname(os.path.abspath(__file__))
# parentDir = os.path.dirname(scriptDir)
# sys.path.append(parentDir)
# from lidar_labeler import labeler_tools as lt
from lidar_labeler import labeler_tools as lt

import geopandas as gpd
import pandas as pd
import numpy as np
import json
from PIL import Image
from osgeo import gdal
from scipy.ndimage import gaussian_filter
# from skimage.transform import rotate
gdal.UseExceptions()
from pathlib import Path

# global_vars = os.path.join(parentDir, 'configs', 'global_variables.json')

# with open(global_vars, 'r') as f:
#     params_dict = json.load(f)

# The global variables are read from <parent of this file's folder>/configs/global_variables.json
with (Path(__file__).resolve().parent.parent / 'configs' / 'global_variables.json').open('r') as f:
    params_dict = json.load(f)

# Access the variables
CLIPPED_ROI_FOLDER = params_dict['CLIPPED_ROI_FOLDER']
IMG_FOLDER = params_dict['IMG_FOLDER']
ROIDF_RSTR_COL_PATTERN = params_dict['ROIDF_RSTR_COL_PATTERN']
ROIDF_IMG_COL_PATTERN = params_dict['ROIDF_IMG_COL_PATTERN']

LABEL_FILE_PATTERN = params_dict['LABEL_FILE_PATTERN']
LABEL_FILE_EXT = params_dict['LABEL_FILE_EXT']
Y_COL_LABEL = params_dict['Y_COL_LABEL']
X_COL_LABEL = params_dict['X_COL_LABEL']


def load_df(filePath:str, checkFilePaths:bool=False):
    """Loads df from filepaths. Optionally checks that the filepaths stored within the df exist and attempts
    recovery within the database directory if not.

    Args:
        filePath (str): Path to geodatabase where clipped rasters and labels are stored.
        checkFilePaths (bool): Whether to check if filepaths within the df exist. Defaults to False.

    Returns:
        df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.

    Raises:
        ValueError: If a stored filepath does not exist and cannot be found in the expected sub-folder next to filePath.
    """
    df = gpd.read_file(filePath, truncation=False)

    if not checkFilePaths:
        return df

    # Pair every path column with the sub-folder (relative to the df file) its files are expected in.
    # Raster columns are matched by prefix, the image column by exact name.
    colFolders = [(col, CLIPPED_ROI_FOLDER) for col in df.columns if col.startswith(ROIDF_RSTR_COL_PATTERN)]
    colFolders += [(col, IMG_FOLDER) for col in df.columns if col == ROIDF_IMG_COL_PATTERN]

    directory = os.path.split(filePath)[0]
    recoveryAttempted = False # Flag so the recovery notice is only printed once
    recoverySuccessful = False

    for index, row in df.iterrows():
        for col, folder in colFolders:
            path = row[col]

            # Null/empty cells reference no file; os.path.exists/os.path.split would raise TypeError on them
            if not isinstance(path, (str, os.PathLike)) or not path:
                continue
            if os.path.exists(path):
                continue

            if not recoveryAttempted:
                print('Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
                recoveryAttempted = True

            # Look for a file with the same name inside the database's own folder structure
            tryPath = os.path.join(directory, folder, os.path.split(path)[1])
            if not os.path.exists(tryPath):
                raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")

            df.at[index, col] = tryPath
            recoverySuccessful = True

    #if files were recovered saves a new df file with the recovered raster/image filepaths
    if recoverySuccessful:
        basePath, ext = os.path.splitext(filePath)
        savePath = f'{basePath}_recovered{ext}'
        df.to_file(savePath)

    return df

def filter_df(df:gpd.GeoDataFrame, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """Filter a GeoDataFrame by keeping only the data, label and geometry columns and removing unlabeled or unwanted entries.

    Args:
        df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (geopandas.GeoDataFrame): An independent copy holding only the data, label and geometry columns,
        without rows whose label is missing or listed in labelsToRemove.

    Raises:
        KeyError: If the data, label or geometry column is not present in df.
    """
    #Remove all columns that are not data and label columns
    colsToKeep = [dataColumn, labelColumn, 'geometry']
    missingCols = [col for col in colsToKeep if col not in df.columns]
    if missingCols:
        raise KeyError(f"Columns missing from the dataframe: {missingCols}")

    #remove entries that have no label or that are in the list labelsToRemove
    filteredDf = df[colsToKeep].dropna(subset=[labelColumn])
    if labelsToRemove:
        if isinstance(labelsToRemove, str):
            labelsToRemove = [labelsToRemove]
        filteredDf = filteredDf[~filteredDf[labelColumn].isin(labelsToRemove)]

    # Always hand back a copy so callers can add columns without touching (or warning about) the source df
    return filteredDf.copy()

def _load_filtered_from_directory(directory, dataColumn, labelColumn, labelsToRemove=None):
    """Walks a directory tree and loads + filters every file following the label file naming convention.

    Returns:
        tuple: (list of filtered GeoDataFrames, list of the label file paths they were loaded from)
    """
    dfs = []
    labelFiles = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if LABEL_FILE_PATTERN in file and file.lower().endswith(LABEL_FILE_EXT):
                labelFile = os.path.join(root, file)
                labelFiles.append(labelFile)
                dfs.append(filter_df(load_df(labelFile), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
    return dfs, labelFiles

def import_filter_concat_df(path, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """
    Import and filter GeoDataFrames from shapefiles or directories of shapefiles. Multiple shapefiles are concatenated into one single filtered df.

    Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles and/or directories.
            All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels, and geometry with any rows
        containing labels matching a value in the labelsToRemove argument removed from the dataframe.

    Raises:
        ValueError: If the input 'path' is of an unsupported type, if the file type or directory structure is not valid,
            or if no label files could be loaded.
    """
    #if the input path is a string (a path to one location)
    if isinstance(path, str):
        #A single file matching the label file convention specified in global variables
        if os.path.isfile(path) and LABEL_FILE_PATTERN in path and path.lower().endswith(LABEL_FILE_EXT):
            return filter_df(load_df(path), dataColumn, labelColumn, labelsToRemove=labelsToRemove)

        #A directory: load every file matching the label file pattern and concatenate into one df
        if os.path.isdir(path):
            dfs, labelDfs = _load_filtered_from_directory(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
            if not dfs:
                raise ValueError(f"No label files matching '{LABEL_FILE_PATTERN}*{LABEL_FILE_EXT}' found in directory: {path}")
            if len(dfs) == 1:
                return dfs[0] #just one df in the directory, nothing to join
            print('Joining Dataframes: ' + str(labelDfs))
            return pd.concat(dfs, ignore_index=True)

        raise ValueError(f"Unsupported file type or directory structure for path: {path}")

    #If given a list of filepaths/directories, load, filter and join these into one
    if isinstance(path, list):
        dfs = []
        for p in path:
            # '.shp' stays accepted for backward compatibility; the configured extension is what the message promises
            if os.path.isfile(p) and p.lower().endswith(('.shp', LABEL_FILE_EXT)):
                dfs.append(filter_df(load_df(p), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
            elif os.path.isdir(p):
                # Extend (not replace) so dataframes gathered from earlier list entries are kept
                dirDfs, labelDfs = _load_filtered_from_directory(p, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
                print('Joining Dataframes: ' + str(labelDfs))
                dfs.extend(dirDfs)
            else:
                print(f"Unsupported input type for {p}. Extension must be {LABEL_FILE_EXT}.")

        if not dfs:
            raise ValueError(f"No label files could be loaded from: {path}")
        return pd.concat(dfs, ignore_index=True)

    raise ValueError(f"Unsupported input type for {path}. Must be a string or a list of strings.")

def make_lookup(labelLookupPath:str, compiledDf:gpd.GeoDataFrame, labelColumn:str):
    """Creates a lookup dictionary that maps integer values to unique string labels in the label column and saves it as JSON.
    Note: this is done in order of appearance for the label. If you wish to have specific labels mapped to specific integer
    values, it is best to make this lookup dictionary manually.

    Args:
        labelLookupPath (str): The path the lookup dictionary is written to.
        compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. This DataFrame should have a column with labels
            that you want to map to integer values.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column
    """
    # tolist() converts numpy scalars to plain Python values so json can serialize them
    uniqueLabels = compiledDf[labelColumn].unique().tolist()
    labelDict = dict(enumerate(uniqueLabels))
    # NOTE: JSON object keys are always strings, so the integer keys are stored (and read back) as '0', '1', ...
    with open(labelLookupPath, 'w') as f:
        json.dump(labelDict, f)

def get_raster_as_grid(raster):
    """Loads in a digital elevation model as a numpy array and converts any no-data to np.nan

    Args:
        raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR
        an already loaded raster dataset

    Returns:
        rasterGrid (numpy.ndarray): The elevation data stored as a grid
        dx (float): The x coordinate spacing of the grid
        dy (float): The y coordinate spacing of the grid

    Raises:
        ValueError: Input is neither a path to a raster or a gdal.Dataset
    """
    if isinstance(raster, gdal.Dataset):
        dataset = raster
    elif isinstance(raster, (str, os.PathLike)) and os.path.isfile(raster):
        dataset = gdal.Open(raster)
    else:
        # The exception must actually be raised; otherwise the code below fails with an unrelated error
        raise ValueError('Specified raster is neither a path to a raster or a gdal.Dataset. Please specify a valid path.')

    rasterGrid = dataset.ReadAsArray().astype(float)
    NDV = dataset.GetRasterBand(1).GetNoDataValue()

    #Mask out NDVs as nan (GetNoDataValue returns None when the band has no no-data value set)
    if NDV is not None:
        rasterGrid[rasterGrid == NDV] = np.nan

    # Grab the basic header information (xUL, dx, rot1, yUL, rot2, dy)
    geotransform = dataset.GetGeoTransform()
    dx = geotransform[1]
    dy = geotransform[-1]

    # Drop the local reference; a dataset opened here is released, one passed in by the caller stays open
    dataset = None

    return rasterGrid, dx, dy

def load_rasters(df:gpd.GeoDataFrame, dataColumn:str, roiWidth:int, dataType:str = 'dem'):
    """Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)

    Args:
        df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Used to size the NaN placeholder grid for files that fail to load.
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.

    Returns:
        dfLoadedRasters (gpd.GeoDataFrame): A copy of df with a new column containing the loaded raster data. The input df is not modified.

    Raises:
        ValueError: If dataType is not one of 'dem', 'image1' or 'image3'.
    """
    # Validate up front: raised inside the per-file try block this error would be swallowed and
    # every row would silently become a NaN grid.
    if dataType not in ('dem', 'image1', 'image3'):
        raise ValueError("Incorrect data type specified. Options: 'dem', 'image1' or 'image3'")

    grids = []
    for path in df[dataColumn].tolist():
        try:
            if dataType == 'dem':
                grid = get_raster_as_grid(path)[0]
            else:
                mode = 'L' if dataType == 'image1' else 'RGB'
                with Image.open(path) as img:
                    grid = np.asarray(img.convert(mode))
        except Exception as e:
            # Deliberate best effort: a file that cannot be read becomes a NaN grid
            print(f'Error processing {path}: {e}. A Nan grid will be used in its place. ')
            grid = np.full((roiWidth, roiWidth), np.nan)
        grids.append(grid)

    # Copy first so the caller's dataframe does not gain the raster column as a side effect
    dfLoadedRasters = df.copy()
    dfLoadedRasters[X_COL_LABEL] = grids #place loaded grids into the X column with a label specified in global variables.

    return dfLoadedRasters

def rm_invalid_raster_rows(dfLoadedRasters, roiWidth:int, rasterCol:str=None, idx:int=None):
    """Removes any rows with nan, infinite, or negative values, a standard deviation of 0, or an unexpected shape
    in a dataframe with a column of loaded raster arrays.

    Args:
        dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
        roiWidth (int): The width of the raster grids. Any rasters whose first two dimensions are not roiWidth x roiWidth will be removed.
        rasterCol (str): The label for the column containing raster arrays. Defaults to the X column label from global variables.
        idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger database, the index value
            can be specified to be included in printout. Defaults to None.
    Returns:
        dfNanRm (pd.DataFrame or gpd.GeoDataFrame): Copy of the input without the invalid rows and with a reset index.
    """
    if not rasterCol:
        rasterCol = X_COL_LABEL

    # Initialize counters for each condition
    countNan = 0
    countInf = 0
    countNegative = 0
    countZeroStdv = 0
    countInvalidShape = 0

    rowsToDrop = []
    for i, row in dfLoadedRasters.iterrows():
        dataArray = row[rasterCol]
        containsNan = bool(np.any(np.isnan(dataArray)))
        containsInf = bool(np.any(np.isinf(dataArray)))
        containsNegative = bool(np.any(dataArray < 0))
        zeroStdv = np.nanstd(dataArray) == 0
        # Only rows x cols are compared so that 3-band images (rows, cols, 3) are not rejected wholesale
        invalidShape = dataArray.shape[:2] != (roiWidth, roiWidth)

        # A row can be counted under several reasons but is only dropped once
        countNan += containsNan
        countInf += containsInf
        countNegative += containsNegative
        countZeroStdv += bool(zeroStdv)
        countInvalidShape += invalidShape

        if containsNan or containsInf or containsNegative or zeroStdv or invalidShape:
            rowsToDrop.append(i)

    #Drop invalid rows and reset the index
    dfNanRm = dfLoadedRasters.drop(rowsToDrop).reset_index(drop=True)

    if len(rowsToDrop) > 0:
        # 'is not None' so that an index of 0 is still reported
        if idx is not None:
            print(f"The following rows were dropped for idx {idx} the respective reasons:")
        else:
            print("The following rows were dropped for the respective reasons:")
        if countNan > 0:
            print(f" - Containing NaN values: {countNan}")
        if countInf > 0:
            print(f" - Containing infinite values: {countInf}")
        if countNegative > 0:
            print(f" - Containing negative values: {countNegative}")
        if countZeroStdv > 0:
            print(f" - Having a standard deviation of 0: {countZeroStdv}")
        if countInvalidShape > 0:
            print(f" - Not having the shape ({roiWidth}, {roiWidth}): {countInvalidShape}")
    return dfNanRm

def make_even_distribution(df:gpd.GeoDataFrame, doPrint:bool=True):
    """Adjusts the distribution of labels in a GeoDataFrame to be even by randomly removing rows from any label that
    has more rows than the least frequent label.

    Args:
        df (gpd.GeoDataFrame): The GeoDataFrame containing the data. Labels are read from the Y column specified in global variables.
        doPrint (bool, optional): If True, prints information about the labels being removed to achieve even distribution. Defaults to True.

    Returns:
        evenlyDistributedDf (gpd.GeoDataFrame): A new GeoDataFrame with an adjusted label distribution. The input df is not modified.
    """
    labelCounts = df[Y_COL_LABEL].value_counts()

    # Calculate the number of rows to remove for each label
    excessCount = labelCounts - labelCounts.min()
    excessCount = excessCount[excessCount > 0]

    for label, count in excessCount.items():
        labelIndices = df[df[Y_COL_LABEL] == label].index
        numToRemove = int(count)
        randomIndices = np.random.choice(labelIndices, size=numToRemove, replace=False)
        df = df.drop(randomIndices) # drop returns a new frame, so the caller's df is untouched

        if doPrint:
            print(f'Removing {numToRemove} labels with label {label} to achieve the desired distribution.')

    return df


def subset_df(df:gpd.GeoDataFrame, subsetSize:int, evenlyDistributeDf:bool = False, doPrint:bool = True):
    """This function creates a random subset of the input GeoDataFrame that preserves the label distribution, optionally
    making the distribution of labels even before subsetting.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. Labels are read from the Y column specified in global variables.
        subsetSize (int): The desired size of the subset. Per-label counts are rounded down, and never exceed the rows available,
            so the result can be smaller than subsetSize.
        evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
        doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.

    Returns:
        gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.
    """
    if evenlyDistributeDf:
        dfToSubset = make_even_distribution(df, doPrint=False)
    else:
        dfToSubset = df.copy()

    # The distribution is taken from the frame that is actually sampled, so an evened frame stays even
    currentDistribution = dfToSubset[Y_COL_LABEL].value_counts(normalize=True).sort_index()
    samplesToKeep = (currentDistribution*subsetSize).astype(int)
    indicesToKeep = []

    for label in dfToSubset[Y_COL_LABEL].unique():
        labelIndices = dfToSubset[dfToSubset[Y_COL_LABEL] == label].index.tolist()
        # Sampling without replacement cannot return more rows than exist (e.g. subsetSize > len(df))
        numToKeep = min(int(samplesToKeep[label]), len(labelIndices))
        subset = np.random.choice(labelIndices, numToKeep, replace = False)
        indicesToKeep.extend(subset)
        if doPrint:
            print(f'Keeping {len(subset)} of {len(labelIndices)} for label: {label} to obtain a dataframe of size: {sum(samplesToKeep)}')
            print(f'Distribution for label {label}: {round(currentDistribution[label], 2)}')

    subsetDf = dfToSubset.loc[indicesToKeep]

    return subsetDf

def process_grids(df:gpd.GeoDataFrame, scalingMethod:str='min_max', doApplyGaussianFilter:bool = False, sigma:float=None):
    """Iterates through the rows of a df and processes the loaded grids to specifications. Each grid is optionally smoothed
    with a gaussian filter and then normalized/rescaled with the specified method (min-max or z-score).

    Args:
        df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids (in the X column specified in global variables) to process.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'min_max'.
            - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
            - 'z_score': Z-score scaling method. Will transform grids to have a mean of 0 and a standard deviation of 1.
            - None (or empty): Grids are not scaled or normalized.
        doApplyGaussianFilter: Specifies if a gaussian filter will be applied to each grid. Defaults to False.
        sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified but gaussian filter argument set to true will default to 1.

    Returns:
        processedDf (gpd.GeoDataFrame): A copy of df with filtered and normalized/rescaled grids.

    Raises:
        ValueError: If scalingMethod is given but is not 'min_max' or 'z_score'.
    """
    if scalingMethod and scalingMethod not in ('min_max', 'z_score'):
        raise ValueError(f"Unknown scaling method: {scalingMethod}. Options: 'min_max' or 'z_score'.")
    if not scalingMethod:
        # Not an error: filtering without scaling is a valid request
        print("No scaling method specified. Grids will not be scaled or normalized.")

    if doApplyGaussianFilter and not sigma:
        print("No sigma specified for gaussian filter. Will default to sigma = 1")
        sigma = 1

    processedDf = df.copy()

    for i, row in processedDf.iterrows():
        grid = row[X_COL_LABEL]

        #Apply a gaussian filter to slightly blur/smooth dem
        if doApplyGaussianFilter:
            gridToRescale = gaussian_filter(grid, sigma = sigma, mode = 'nearest')
        else:
            gridToRescale = grid

        #Rescales grids by either min max or zscore
        if scalingMethod == 'min_max':
            minVal = np.nanmin(gridToRescale)
            maxVal = np.nanmax(gridToRescale)
            normGrid = (gridToRescale-minVal)/(maxVal-minVal)
        elif scalingMethod == 'z_score':
            meanValue = np.nanmean(gridToRescale)
            stdDeviation = np.nanstd(gridToRescale)
            normGrid = (gridToRescale - meanValue) / stdDeviation
        else:
            normGrid = gridToRescale

        processedDf.at[i, X_COL_LABEL] = normGrid

    return processedDf

def shuffle_df(df:gpd.GeoDataFrame):
    """Shuffles the rows of a DataFrame.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.

    Returns:
        shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows and a reset index.
    """
    shuffleIndices = np.random.permutation(len(df))
    shuffledDf = df.iloc[shuffleIndices].reset_index(drop=True)
    return shuffledDf

def make_arrays(df:gpd.GeoDataFrame, dataType:str = 'dem', dataCol:str = None, labelCol:str = None):
    """Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
        dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
        dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if available.
        labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if available.

    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands).
            - labelArray (np.ndarray or None): The array of integer labels, with shape (num_samples, 1).
        Both are None for an empty df; each is None when its column is not available.

    Raises:
        ValueError: If a data column is used and dataType is not 'dem', 'image1' or 'image3'.
    """
    if df.empty:
        return None, None

    if not dataCol and X_COL_LABEL in df.columns:
        dataCol = X_COL_LABEL
    if not labelCol and Y_COL_LABEL in df.columns:
        labelCol = Y_COL_LABEL

    dataArray = None
    labelArray = None

    if dataCol:
        if dataType == 'image3':
            nbands = 3
        elif dataType in ('image1', 'dem'):
            nbands = 1
        else:
            raise ValueError("Incorrect datatype specified for dataType. Must be either 'dem', 'image3' or 'image1'.")

        # All grids are assumed to share the shape of the first one
        nrows, ncols = df[dataCol].iloc[0].shape[:2]
        dataArray = np.zeros((len(df), nrows, ncols, nbands))

        for i, grid in enumerate(df[dataCol]):
            dataArray[i,:,:,:] = grid.reshape(nrows, ncols, nbands)

    if labelCol:
        labelArray = df[labelCol].values.reshape((len(df), 1)).astype(int)

    return dataArray, labelArray

def preprocess_df(path:str, dataColumn:str, roiWidth:int, labelColumn:str, labelLookupPath:str, label:str, saveDirectory:str,
                  dataType:str = 'dem', labelsToRemove = None, scalingMethod:str = 'z_score',
                  doApplyGaussianFilter:bool = True, sigma:float = 0.5,
                  evenlyDistributeDf:bool = False, subset:int = None
                  ):
    """Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.

    Required Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one
            does not currently exist.
        label (str): The label used to uniquely identify the output files.
        saveDirectory (str): The path to the desired output directory. Created if it does not exist.
    Optional Args:
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
        labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'z_score'.
            - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
            - 'z_score': Z-score scaling method. Will transform grids to have a mean of 0 and a standard deviation of 1.
        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter should be applied to grids. This is essentially a blurring/averaging filter that can help
            remove irregularities/artifacts in data. Defaults to True.
        sigma (float, optional): The sigma value used for the gaussian filter. Defaults to 0.5.
        evenlyDistributeDf (bool, optional): When true, will remove some rows from the dataframe to create an even distribution of labels. Defaults to False.
        subset (int, optional): When a number is specified, rows will be removed randomly to produce a dataframe of (at most) this size.
    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray): The processed data array.
            - labelArray (np.ndarray): The processed label array.
            - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.
        (None, None, None) is returned when every row was removed during preprocessing.

    Raises:
        ValueError: If no label lookup exists and the user declines to create one.
        KeyError: If the data contain labels that are missing from the label lookup.
    """
    dataArrayPath = os.path.join(saveDirectory, f'{label}_data.npy')
    labelArrayPath = os.path.join(saveDirectory, f'{label}_labels.npy')
    dfOutPath = os.path.join(saveDirectory, f'{label}_df.shp')

    # Only ask before overwriting when a complete set of outputs already exists
    overwrite = True
    if os.path.exists(dataArrayPath) and os.path.exists(labelArrayPath) and os.path.exists(dfOutPath):
        qStr = "Existing files for data and label arrays detected. Proceed with preprocessing and overwrite existing files (y/n)?"
        overwrite = lt._request_yn_input(qStr)

    if not overwrite:
        #Load from existing files
        print(f'Loading from preexisting files.\nData Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDf Path: {dfOutPath}')
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load arrays written by this function
        dataArray = np.load(dataArrayPath, allow_pickle=True)
        labelArray = np.load(labelArrayPath, allow_pickle=True)
        outDf = gpd.read_file(dfOutPath)
        return dataArray, labelArray, outDf

    print('Importing, filtering, and joining into one dataframe...')
    compiledDf = import_filter_concat_df(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)

    #connect to dictionary with labels and int vals or make one
    if not os.path.exists(labelLookupPath):
        qstr = "No label lookup exists at this filepath. Would you like to make one (y/n)."
        if lt._request_yn_input(qstr):
            make_lookup(labelLookupPath, compiledDf, labelColumn)
        else:
            raise ValueError('No label lookup exists at filepath. Please make a lookup! :~)')
    with open(labelLookupPath, 'r') as f:
        labelDict = json.load(f)

    #Make a column for int labels (keys come back from JSON as strings; make_arrays casts them to int)
    labelDictRev = {value: key for key, value in labelDict.items()}
    unknownLabels = sorted(set(compiledDf[labelColumn].unique()) - set(labelDictRev), key=str)
    if unknownLabels:
        raise KeyError(f"Labels missing from the label lookup at {labelLookupPath}: {unknownLabels}")
    compiledDf[Y_COL_LABEL] = compiledDf[labelColumn].map(labelDictRev)

    # Subset before loading so that no more rasters than necessary are read from disk
    if subset:
        print(f"Subsetting the dataframe to subset size: {subset}")
        dfToLoad = subset_df(compiledDf, subset, evenlyDistributeDf=evenlyDistributeDf, doPrint=False)
    else:
        dfToLoad = compiledDf

    #load rasters
    print('Loading rasters...')
    dfLoadedRasters = load_rasters(dfToLoad, dataColumn, roiWidth, dataType= dataType)

    print('Removing entries with Nan vals...')
    # Handle nan vals in arrays
    dfCleaned = rm_invalid_raster_rows(dfLoadedRasters, roiWidth)

    #option to remove some labels to make an even distribution (cleaning may have unbalanced the labels again)
    distributedDf = make_even_distribution(dfCleaned) if evenlyDistributeDf else dfCleaned

    subsetDf = subset_df(distributedDf, subset) if subset else distributedDf

    if not scalingMethod and not doApplyGaussianFilter:
        processedDf = subsetDf
    else:
        print(f'Processing grids with:\nscaling method: {scalingMethod}\ngaussian filter: {doApplyGaussianFilter}, sigma: {sigma}')
        processedDf = process_grids(subsetDf, scalingMethod=scalingMethod, doApplyGaussianFilter=doApplyGaussianFilter, sigma=sigma)

    #Shuffles Df
    shuffledDf = shuffle_df(processedDf)

    print('Converting data and labels to arrays...')
    dataArray, labelArray = make_arrays(shuffledDf, dataType = dataType)

    if dataArray is None or labelArray is None:
        print("All df rows removed during preprocessing due to invalid raster data. No arrays or df's saved.")
        return None, None, None

    #Save Data and Label Arrays
    if saveDirectory:
        os.makedirs(saveDirectory, exist_ok=True)
    np.save(dataArrayPath, dataArray)
    np.save(labelArrayPath, labelArray)

    # The loaded grids live in the .npy file; array cells cannot be written to a shapefile
    outDf = shuffledDf.drop(columns=[X_COL_LABEL])
    outDf.to_file(dfOutPath)

    #Print Paths
    print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')

    return dataArray, labelArray, outDf



if __name__ == '__main__':
    # Load parameters from the JSON file given as the first command line argument
    if len(sys.argv) < 2:
        raise SystemExit(f'Usage: python {sys.argv[0]} <path to preprocessing parameters json>')
    with open(sys.argv[1], 'r') as f:
        runParams = json.load(f)

    dataArray, labelArray, outDf = preprocess_df(**runParams)
def load_df(filePath:str, checkFilePaths:bool=False):
    """Loads df from filepaths. Optionally checks that the filepaths stored within the df exist and attempts
    recovery within the database directory if not.

    Args:
        filePath (str): Path to geodatabase where clipped rasters and labels are stored.
        checkFilePaths (bool): Whether to check if filepaths within the df exist. Defaults to False.

    Returns:
        df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.

    Raises:
        ValueError: If a stored filepath does not exist and cannot be found in the expected sub-folder next to filePath.
    """
    df = gpd.read_file(filePath, truncation=False)

    if not checkFilePaths:
        return df

    # Pair every path column with the sub-folder (relative to the df file) its files are expected in.
    # Raster columns are matched by prefix, the image column by exact name.
    colFolders = [(col, CLIPPED_ROI_FOLDER) for col in df.columns if col.startswith(ROIDF_RSTR_COL_PATTERN)]
    colFolders += [(col, IMG_FOLDER) for col in df.columns if col == ROIDF_IMG_COL_PATTERN]

    directory = os.path.split(filePath)[0]
    recoveryAttempted = False # Flag so the recovery notice is only printed once
    recoverySuccessful = False

    for index, row in df.iterrows():
        for col, folder in colFolders:
            path = row[col]

            # Null/empty cells reference no file; os.path.exists/os.path.split would raise TypeError on them
            if not isinstance(path, (str, os.PathLike)) or not path:
                continue
            if os.path.exists(path):
                continue

            if not recoveryAttempted:
                print('Unable to locate some filepaths. Attempting to recover filepaths within the specified directory.')
                recoveryAttempted = True

            # Look for a file with the same name inside the database's own folder structure
            tryPath = os.path.join(directory, folder, os.path.split(path)[1])
            if not os.path.exists(tryPath):
                raise ValueError(f"Unable to locate necessary filepaths.\nAttempted Path 1: {path}\nAttempted Path 2: {tryPath}")

            df.at[index, col] = tryPath
            recoverySuccessful = True

    #if files were recovered saves a new df file with the recovered raster/image filepaths
    if recoverySuccessful:
        basePath, ext = os.path.splitext(filePath)
        savePath = f'{basePath}_recovered{ext}'
        df.to_file(savePath)

    return df
Loads df from filepaths. Checks df to see if filepaths within the df exist and attempts recovery within the database if not.
Arguments:
- filePath (str): Path to geodatabase where clipped rasters and labels are stored.
- checkFilePaths (bool): Whether to check if filepaths within the df exist. Default to False.
Returns:
df (gpd.GeoDataFrame): A GeoDataFrame loaded from the filepath containing ROI polygons and file paths to associated clipped DEMs.
def filter_df(df:gpd.GeoDataFrame, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """Reduce a GeoDataFrame to its data, label and geometry columns and drop unwanted rows.

    Rows whose label is missing are always dropped. Rows whose label appears in
    ``labelsToRemove`` are dropped as well when that argument is given.

    Args:
        df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (geopandas.GeoDataFrame): GeoDataFrame holding only the data, label and geometry
            columns, without the rows removed by the conditions above.
    """
    # Keep only the three columns of interest, then discard rows without a label.
    labelled = df[[dataColumn, labelColumn, 'geometry']].dropna(subset=[labelColumn])

    if not labelsToRemove:
        return labelled.copy()

    # A single label given as a string is treated as a one-element list.
    unwanted = [labelsToRemove] if isinstance(labelsToRemove, str) else labelsToRemove
    return labelled[~labelled[labelColumn].isin(unwanted)]
Filter a GeoDataFrame by keeping only specified data and label columns and removing entries based on conditions.
Arguments:
- df (geopandas.GeoDataFrame): Input GeoDataFrame containing data and label columns.
- dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
- labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
- labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
Returns:
filteredDf (geopandas.GeoDataFrame): Filtered GeoDataFrame with only specified data and label columns, and entries removed based on specified conditions.
def _collect_label_dfs(directory, dataColumn, labelColumn, labelsToRemove=None):
    """Load and filter every label file found anywhere beneath ``directory``.

    Returns a tuple ``(dfs, labelPaths)``: the filtered frames and the paths they were read from.
    """
    dfs = []
    labelPaths = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Only files following the label file naming convention from the global variables are loaded.
            if LABEL_FILE_PATTERN in file and file.lower().endswith(LABEL_FILE_EXT):
                filePath = os.path.join(root, file)
                labelPaths.append(filePath)
                dfs.append(filter_df(load_df(filePath), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
    return dfs, labelPaths


def import_filter_concat_df(path, dataColumn:str, labelColumn:str, labelsToRemove=None):
    """
    Import and filter GeoDataFrames from shapefiles or directories of shapefiles. If multiple shapefiles are found
    they are concatenated into one single filtered df.

    Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list whose entries are file paths
            to shapefiles or directories of shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.

    Returns:
        filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels,
            and geometry, with any rows whose label matches a value in the labelsToRemove argument removed.

    Raises:
        ValueError: If the input 'path' is of an unsupported type, if the file type or directory structure is not
            valid, or if no label file could be loaded from the given location(s).
    """
    if isinstance(path, str):
        # A single label file that follows the naming convention from the global variables.
        if os.path.isfile(path) and LABEL_FILE_PATTERN in path and path.lower().endswith(LABEL_FILE_EXT):
            return filter_df(load_df(path), dataColumn, labelColumn, labelsToRemove=labelsToRemove)

        if not os.path.isdir(path):
            raise ValueError(f"Unsupported file type or directory structure for path: {path}")

        dfs, labelPaths = _collect_label_dfs(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
        if not dfs:
            # Previously this surfaced as an IndexError from dfs[0].
            raise ValueError(f"No label files were found in directory: {path}")
        if len(dfs) == 1:
            # A single frame is returned as loaded, without concatenation.
            return dfs[0]

        print('Joining Dataframes: ' + str(labelPaths))
        return pd.concat(dfs, ignore_index=True)

    if isinstance(path, list):
        dfs = []
        labelPaths = []
        for p in path:
            # '.shp' is still accepted for backward compatibility; LABEL_FILE_EXT is what the message below promises.
            if os.path.isfile(p) and p.lower().endswith(('.shp', LABEL_FILE_EXT)):
                dfs.append(filter_df(load_df(p), dataColumn, labelColumn, labelsToRemove=labelsToRemove))
                labelPaths.append(p)
            elif os.path.isdir(p):
                # Inspect the list element (not the whole list) and keep the frames gathered so far.
                foundDfs, foundPaths = _collect_label_dfs(p, dataColumn, labelColumn, labelsToRemove=labelsToRemove)
                dfs.extend(foundDfs)
                labelPaths.extend(foundPaths)
            else:
                print(f"Unsupported input type for {p}. Extension must be {LABEL_FILE_EXT}.")

        if not dfs:
            raise ValueError("No label files could be loaded from the supplied list of paths.")
        if len(dfs) > 1:
            print('Joining Dataframes: ' + str(labelPaths))
        return pd.concat(dfs, ignore_index=True)

    raise ValueError(f"Unsupported input type for {path}. Must be a string or a list of strings.")
Import and filter GeoDataFrames from shapefiles or directories of shapefiles. If multiple shapefiles will concatenate into one single filtered df.
Arguments:
- path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
- dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
- labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
- labelsToRemove (str or list, optional): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
Returns:
filteredDf (gpd.GeoDataFrame): A compiled geopandas GeoDataFrame with the columns associated with data, labels, and geometry, with any rows containing labels matching a value in the labelsToRemove argument removed from the dataframe.
Raises:
- ValueError: If the input 'path' is of an unsupported type or if the file type or directory structure is not valid.
def make_lookup(labelLookupPath:str, compiledDf:gpd.GeoDataFrame, labelColumn:str):
    """Write a JSON lookup that maps integer values to the unique labels of a label column.

    Integers are assigned in the order in which the labels first appear in the column. If specific
    labels must map to specific integers, create the lookup file manually instead.

    Args:
        labelLookupPath (str): Path of the JSON file to write. An existing file at this path is overwritten.
        compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. It must have
            the column named by ``labelColumn``.
        labelColumn (str): A string matching the column name for the label column containing labels
            associated with your data column.
    """
    # enumerate() numbers the labels 0, 1, 2, ... in order of first appearance.
    lookup = dict(enumerate(compiledDf[labelColumn].unique()))

    with open(str(labelLookupPath), 'w') as lookupFile:
        json.dump(lookup, lookupFile)
Creates a lookup dictionary that maps integer values to unique string labels in the label column. Note: this is done in order of appearance for the label. If you wish to have specific labels mapped to specific integer values, it is best to make this lookup dictionary manually.
Arguments:
- labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one does not currently exist.
- compiledDf (gpd.GeoDataFrame): The compiled DataFrame containing all training data. This DataFrame should have a column with labels that you want to map to integer values.
- labelColumn (str): A string matching the column name for the label column containing labels associated with your data column
def get_raster_as_grid(raster):
    """Loads in a digital elevation model as a numpy array and converts any no-data to np.nan

    Args:
        raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR
            an already loaded raster dataset

    Returns:
        rasterGrid (numpy.ndarray): The elevation data stored as a grid (cast to float)
        dx (float): The x coordinate spacing of the grid (element 1 of the geotransform)
        dy (float): The y coordinate spacing of the grid (last element of the geotransform)

    Raises:
        ValueError: Input is neither a path to an existing raster file nor a gdal.Dataset.
    """
    if isinstance(raster, gdal.Dataset):
        # The caller owns an already opened dataset, so it is left open.
        dataset = raster
        openedHere = False
    elif isinstance(raster, (str, bytes, os.PathLike)) and os.path.isfile(raster):
        dataset = gdal.Open(raster)
        openedHere = True
    else:
        # The original built this exception without raising it, so bad input failed later with an unrelated error.
        raise ValueError('Specified raster is neither a path to a raster or a gdal.Dataset. Please specify a valid path.')

    try:
        rasterGrid = dataset.ReadAsArray().astype(float)
        noDataValue = dataset.GetRasterBand(1).GetNoDataValue()

        # Mask out no-data cells as nan; a band without a no-data value has nothing to mask.
        if noDataValue is not None:
            rasterGrid[rasterGrid == noDataValue] = np.nan

        # Basic header information (xUL, dx, rot1, yUL, rot2, dy)
        geotransform = dataset.GetGeoTransform()
    finally:
        if openedHere:
            dataset = None  # Dropping the reference closes the raster opened here

    dx = geotransform[1]
    dy = geotransform[-1]

    return rasterGrid, dx, dy
Loads in a digital elevation model as a numpy array and converts any no-data to np.nan
Arguments:
- raster (str OR gdal.Dataset): The path to a single band, gdal readable, digital elevation model OR an already loaded raster dataset
Returns:
rasterGrid (numpy.ndarray): The elevation data stored as a grid dx (float): The x coordinate spacing of the grid dy (float): The y coordinate spacing of the grid
Raises:
- Exception: Input is neither a path to a raster or a gdal.Dataset
def load_rasters(df:gpd.GeoDataFrame, dataColumn:str, roiWidth:int, dataType:str = 'dem'):
    """Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)

    A path that cannot be read does not abort the run: the error is printed and a
    (roiWidth, roiWidth) grid of NaN is stored in its place. The input frame itself is left unchanged;
    the loaded grids are added to a copy.

    Args:
        df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Used for the size of the NaN placeholder grid.
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.

    Returns:
        dfLoadedRasters (gpd.GeoDataFrame): A copy of ``df`` with a new column (named by X_COL_LABEL) containing the loaded raster data.

    Raises:
        ValueError: If ``dataType`` is not one of 'dem', 'image1' or 'image3'.
    """
    # Validate once, up front. Raised inside the per-file try block this error was swallowed
    # and every row silently became a NaN grid.
    if dataType not in ('dem', 'image1', 'image3'):
        raise ValueError("Incorrect data type specified. Options: 'dem', 'image1' or 'image3'.")

    grids = []
    for path in df[dataColumn].tolist():
        try:
            if dataType == 'dem':
                grid = get_raster_as_grid(path)[0]
            else:
                # 'L' is single band greyscale, 'RGB' is three band.
                mode = 'L' if dataType == 'image1' else 'RGB'
                with Image.open(path) as img:
                    grid = np.asarray(img.convert(mode))
        except Exception as e:
            # Best effort: keep going and leave a placeholder grid for this row.
            print(f'Error processing {path}: {e}. A Nan grid will be used in its place. ')
            grid = np.full((roiWidth, roiWidth), np.nan)
        grids.append(grid)

    # Add the grids to a copy so the caller's frame is not modified.
    dfLoadedRasters = df.copy()
    dfLoadedRasters[X_COL_LABEL] = grids

    return dfLoadedRasters
Loads the rasters depending on the datatype of the data column. Can either be a digital elevation model, or an image (1 or 3 band)
Arguments:
- df (gpd.GeoDataFrame): geopandas GeoDataFrame containing a datacolumn containing paths to digital elevation models or images.
- dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
- roiWidth (int): The width of the raster grids.
- dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
Returns:
dfLoadedRasters (gpd.GeoDataFrame): A geopandas GeoDataFrame with a new column containing the loaded raster data.
def rm_invalid_raster_rows(dfLoadedRasters, roiWidth:int, rasterCol:str=None, idx:int=None):
    """Removes any rows with nan, infinite, or negative values, a standard deviation of 0, or an unexpected shape
    in a dataframe with a column of loaded raster arrays.

    Args:
        dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
        roiWidth (int): The width of the raster grids. Any rasters whose first two dimensions are not roiWidth x roiWidth
            will be removed. Both 2-D grids and 3-D (rows, cols, bands) grids are accepted.
        rasterCol (str): The label for the column containing raster arrays. Defaults to the global X_COL_LABEL.
        idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger
            database, the index value can be specified to be included in printout. Defaults to None.

    Returns:
        dfNanRm (pd.DataFrame or gpd.GeoDataFrame): A copy of the input without the invalid rows and with a reset index.
    """
    if not rasterCol:
        rasterCol = X_COL_LABEL

    expectedShape = (roiWidth, roiWidth)

    # One counter per rejection reason; a row can count towards several of them.
    countNan = 0
    countInf = 0
    countNegative = 0
    countZeroStdv = 0
    countInvalidShape = 0

    rowsToDrop = []
    for i, grid in dfLoadedRasters[rasterCol].items():
        grid = np.asarray(grid)
        containsNan = bool(np.any(np.isnan(grid)))
        containsInf = bool(np.any(np.isinf(grid)))
        containsNegative = bool(np.any(grid < 0))
        zeroStdv = bool(np.nanstd(grid) == 0)
        # Only the leading (rows, cols) dimensions are compared so that three band images,
        # which carry a trailing band axis, are not rejected wholesale.
        invalidShape = grid.ndim not in (2, 3) or grid.shape[:2] != expectedShape

        countNan += containsNan
        countInf += containsInf
        countNegative += containsNegative
        countZeroStdv += zeroStdv
        countInvalidShape += invalidShape

        if containsNan or containsInf or containsNegative or zeroStdv or invalidShape:
            rowsToDrop.append(i)

    # Drop the invalid rows from a copy and renumber what is left.
    dfNanRm = dfLoadedRasters.drop(rowsToDrop).reset_index(drop=True)

    if len(rowsToDrop) > 0:
        # 'is not None' so that an index value of 0 is still reported.
        if idx is not None:
            print(f"The following rows were dropped for idx {idx} the respective reasons:")
        else:
            print("The following rows were dropped for the respective reasons:")
        if countNan > 0:
            print(f" - Containing NaN values: {countNan}")
        if countInf > 0:
            print(f" - Containing infinite values: {countInf}")
        if countNegative > 0:
            print(f" - Containing negative values: {countNegative}")
        if countZeroStdv > 0:
            print(f" - Having a standard deviation of 0: {countZeroStdv}")
        if countInvalidShape > 0:
            print(f" - Not having the shape ({roiWidth}, {roiWidth}): {countInvalidShape}")
    return dfNanRm
Removes any rows with nan, infinite, or negative values or having a standard deviation of 0 in a dataframe with a column of loaded raster arrays.
Arguments:
- dfLoadedRasters (pd.DataFrame or gpd.GeoDataFrame): Dataframe or GeoDataFrame containing raster data loaded as arrays.
- roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
- rasterCol (str): The label for the column containing raster arrays
- idx (int): If using this function to preprocess the rasters for a gridded database that is stored in a larger database, the index value can be specified to be included in printout. Defaults to None.
Returns:
dfNanRm (pd.DataFrame or gpd.GeoDataFrame): Updated DataFrame after handling NaN values.
def make_even_distribution(df:gpd.GeoDataFrame, doPrint:bool=True):
    """Balance the label counts of a GeoDataFrame by randomly discarding rows of over-represented labels.

    Every label is reduced to the number of rows of the least frequent label. The rows to discard
    are drawn with ``np.random.choice`` without replacement.

    Args:
        df (gpd.GeoDataFrame): The GeoDataFrame containing the data. It must have the label column named by Y_COL_LABEL.
        doPrint (bool, optional): If True, prints how many rows are removed for each label. Defaults to True.

    Returns:
        evenlyDistributedDf (gpd.GeoDataFrame): A new GeoDataFrame with an adjusted label distribution.
    """
    labelTotals = df[Y_COL_LABEL].value_counts()

    # Number of rows each label has beyond the rarest label; labels already at the minimum are skipped.
    surplus = labelTotals - labelTotals.min()
    surplus = surplus[surplus > 0]

    for label, extra in surplus.items():
        numToRemove = int(extra)
        candidates = df.index[df[Y_COL_LABEL] == label]
        discarded = np.random.choice(candidates, size=numToRemove, replace=False)
        df = df.drop(discarded)

        if doPrint:
            print(f'Removing {numToRemove} labels with label {label} to achieve the desired distribution.')

    return df
Adjusts the distribution of labels in a GeoDataFrame to be even by removing rows from any label that doesn't have the smallest number of labels.
Arguments:
- df (gpd.GeoDataFrame): The GeoDataFrame containing the data. This DataFrame should have a column with labels that you want to distribute evenly.
- doPrint (bool, optional): If True, prints information about the labels being removed to achieve even distribution. Defaults to True.
Returns:
evenlyDistributedDf (gpd.GeoDataFrame): A new GeoDataFrame with an adjusted label distribution.
def subset_df(df:gpd.GeoDataFrame, subsetSize:int, evenlyDistributeDf:bool = False, doPrint:bool = True):
    """This function creates a subset of the input GeoDataFrame, optionally ensuring that the distribution of labels is even before subsetting.

    The number of rows kept for a label is its share of the frame being subset multiplied by ``subsetSize``
    (rounded down), and never more than the number of rows available for that label. The resulting frame can
    therefore be slightly smaller than ``subsetSize``.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. It must have the label column named by Y_COL_LABEL.
        subsetSize (int): The desired size of the subset.
        evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
        doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.

    Returns:
        gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.
    """
    if evenlyDistributeDf:
        dfToSubset = make_even_distribution(df, doPrint=False)
    else:
        dfToSubset = df.copy()

    # Shares are taken from the frame that is actually sampled. Using the shares of the input frame
    # after evening it out could request more rows for a label than remain.
    labels = dfToSubset[Y_COL_LABEL]
    currentDistribution = labels.value_counts(normalize=True).sort_index()
    available = labels.value_counts().sort_index()

    # Cap at what is available; sampling without replacement raises when asked for more.
    samplesToKeep = np.minimum((currentDistribution * subsetSize).astype(int), available)
    targetSize = int(samplesToKeep.sum())

    indicesToKeep = []
    for label in labels.unique():
        labelIndices = dfToSubset[labels == label].index.tolist()
        subset = np.random.choice(labelIndices, int(samplesToKeep.loc[label]), replace=False)
        indicesToKeep.extend(subset)
        if doPrint:
            print(f'Keeping {len(subset)} of {len(labelIndices)} for label: {label} to obtain a dataframe of size: {targetSize}')
            print(f'Distribution for label {label}: {round(currentDistribution.loc[label], 2)}')

    subsetDf = dfToSubset.loc[indicesToKeep]

    return subsetDf
This function creates a subset of the input GeoDataFrame, optionally ensuring that the distribution of labels is even before subsetting.
Arguments:
- df (gpd.GeoDataFrame): The input GeoDataFrame containing the data. This DataFrame should have a column with labels that you want to subset.
- subsetSize (int): The desired size of the subset.
- evenlyDistributeDf (bool, optional): If True, adjusts the DataFrame to have an even distribution of labels before subsetting. Defaults to False.
- doPrint (bool, optional): If True, prints information about the subsetting process, including the number of samples kept for each label. Defaults to True.
Returns:
gpd.GeoDataFrame: A new GeoDataFrame that is a subset of the input DataFrame with the specified size and optional even label distribution.
def process_grids(df:gpd.GeoDataFrame, scalingMethod:str='min_max', doApplyGaussianFilter:bool = False, sigma:float=None):
    """Iterates through the rows of a df and processes the loaded grids to specifications.

    Each grid is optionally smoothed with a gaussian filter and then normalized and rescaled with the
    specified method (min-max or z-score). No augmentation is performed by this function.

    Args:
        df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids (in the column named by X_COL_LABEL) to process.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'min_max'.
            - 'min_max': Min-max scaling method, (grid - min) / (max - min).
            - 'z_score': Z-score scaling method, (grid - mean) / standard deviation.
            - None or '': grids are not scaled; only the gaussian filter (if requested) is applied.
        doApplyGaussianFilter: Specifies if a gaussian filter will be applied to each grid. Defaults to False.
        sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified but gaussian filter
            argument set to true will default to 1.

    Returns:
        processedDf (gpd.GeoDataFrame): A copy of the input with the processed grids stored in the X_COL_LABEL column.

    Raises:
        ValueError: If ``scalingMethod`` is given but is neither 'min_max' nor 'z_score'.
    """
    if scalingMethod and scalingMethod not in ('min_max', 'z_score'):
        raise ValueError(f"Unknown scaling method '{scalingMethod}'. Must be 'min_max', 'z_score' or None.")

    if not scalingMethod:
        # Previously this case raised a Warning and aborted, although the message promises pass-through.
        print("No scaling method specified. Grids will not be scaled or normalized.")

    if doApplyGaussianFilter and not sigma:
        print("No sigma specified for gaussian filter. Will default to sigma = 1")
        sigma = 1

    processedDf = df.copy()

    for i, grid in df[X_COL_LABEL].items():
        # Apply a gaussian filter to slightly blur/smooth the grid
        if doApplyGaussianFilter:
            gridToRescale = gaussian_filter(grid, sigma = sigma, mode = 'nearest')
        else:
            gridToRescale = grid

        # Rescale by either min-max or z-score.
        # NOTE: a constant grid gives a zero denominator here; there is no guard for it in this function.
        if scalingMethod == 'min_max':
            minVal = np.nanmin(gridToRescale)
            maxVal = np.nanmax(gridToRescale)
            normGrid = (gridToRescale - minVal) / (maxVal - minVal)
        elif scalingMethod == 'z_score':
            meanValue = np.nanmean(gridToRescale)
            stdDeviation = np.nanstd(gridToRescale)
            normGrid = (gridToRescale - meanValue) / stdDeviation
        else:
            normGrid = gridToRescale

        processedDf.at[i, X_COL_LABEL] = normGrid

    return processedDf
Iterates through the rows of a df and processes the dem grids to specifications. This function will normalize and rescale the raster grids based on the specified method (min-max or z-score). It also has the option to apply a gaussian filter. The description also mentions augmenting the grids by flipping up and down and rotating +90 and -90 degrees, but the source shown for this function performs no augmentation.
Arguments:
- df (gpd.GeoDataFrame): The input dataframe containing loaded raster grids to process.
- scalingMethod (str, optional): The normalization and rescaling method.
- - 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
- - 'z_score': Z-score scaling method. Will transform grids into a standard normal distribution with a mean of 0 and a standard deviation of 1
- (Default: 'min_max')
- doApplyGaussianFilter: Specifies if a gaussian filter will be applied to each grid. Defaults to False.
- sigma (float, optional): Sigma value for gaussian filter. Defaults to None. If not specified but gaussian filter argument set to true will default to 1.
- Note (leftover from a former augmentation option that is not a parameter of the function shown): augmentation includes flipping each grid up and down and rotating each grid +90 and -90 degrees. The original row of the input dataframe is copied, the corresponding geometry is removed, and these augmented rows are joined with the input dataframe.
Returns:
processedDf (gpd.GeoDataFrame): The processed Df with normalized, rescaled, filtered, and augmented grids.
def shuffle_df(df:gpd.GeoDataFrame):
    """Return a copy of a DataFrame with its rows in random order and a fresh 0..n-1 index.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.

    Returns:
        shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows.
    """
    # A random ordering of the row positions 0..len(df)-1.
    rowOrder = np.random.permutation(len(df))
    reordered = df.iloc[rowOrder]
    return reordered.reset_index(drop=True)
Shuffles the rows of a DataFrame.
Arguments:
- df (gpd.GeoDataFrame): The input GeoDataFrame to be shuffled.
Returns:
shuffledDf (gpd.GeoDataFrame): A new GeoDataFrame with shuffled rows.
def make_arrays(df:gpd.GeoDataFrame, dataType:str = 'dem', dataCol:str = None, labelCol:str = None):
    """Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.

    Args:
        df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
        dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
        dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if available.
        labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if available.

    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands).
              None when the frame is empty or no data column is available.
            - labelArray (np.ndarray or None): The array of integer labels, with shape (num_samples, 1).
              None when the frame is empty or no label column is available.

    Raises:
        ValueError: If a data column is used and ``dataType`` is not 'dem', 'image1' or 'image3'.
    """
    if df.empty:
        return None, None

    if not dataCol and X_COL_LABEL in df.columns:
        dataCol = X_COL_LABEL
    if not labelCol and Y_COL_LABEL in df.columns:
        labelCol = Y_COL_LABEL

    dataArray = None
    labelArray = None

    if dataCol:
        if dataType == 'image3':
            nbands = 3
        # Membership test: the former "== 'image1' or 'dem'" was always true, so bad values were never rejected.
        elif dataType in ('image1', 'dem'):
            nbands = 1
        else:
            raise ValueError("Incorrect datatype specified for dataType. Must be either 'dem', 'image3' or 'image1'.")

        grids = df[dataCol]
        # The grid size is taken from the first row; every grid is reshaped to it.
        nrows, ncols = grids.iloc[0].shape[:2]

        dataArray = np.zeros((len(df), nrows, ncols, nbands))
        for i, grid in enumerate(grids):
            dataArray[i, :, :, :] = grid.reshape(nrows, ncols, nbands)

    if labelCol:
        labelArray = df[labelCol].values.reshape((len(df), 1)).astype(int)

    return dataArray, labelArray
Converts columns of a GeoDataFrame into data and label arrays for use in machine learning models.
Arguments:
- df (gpd.GeoDataFrame): The input GeoDataFrame containing the data and labels.
- dataType (str, optional): The type of data, either 'dem' or 'image1' for single band data or 'image3' for three-band data. Defaults to 'dem'.
- dataCol (str, optional): The column name for the data. Defaults to None, in which case it uses the global variable X_COL_LABEL if available.
- labelCol (str, optional): The column name for the labels. Defaults to None, in which case it uses the global variable Y_COL_LABEL if available.
Returns:
tuple: A tuple containing: - dataArray (np.ndarray or None): The array of data, with shape (num_samples, nrows, ncols, nbands). - labelArray (np.ndarray or None): The array of labels, with shape (num_samples, 1).
def preprocess_df(path:str, dataColumn:str, roiWidth:int, labelColumn:str, labelLookupPath:str, label:str, saveDirectory:str,
                dataType:str = 'dem', labelsToRemove = None, scalingMethod:str = 'z_score',
                doApplyGaussianFilter:bool = True, sigma:float = 0.5,
                evenlyDistributeDf:bool = False, subset:int = None
                ):
    """Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.

    The outputs are saved in ``saveDirectory`` as ``<label>_data.npy``, ``<label>_labels.npy`` and ``<label>_df.shp``.
    If all three files already exist the user is asked whether to overwrite them; on a "no" answer the existing
    files are loaded and returned instead of being recomputed.

    Required Args:
        path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn.
        dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images.
        roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed.
        labelColumn (str): A string matching the column name for the label column containing labels associated with your data column.
        labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one
            does not currently exist.
        label (str): The label used to uniquely identify the output files.
        saveDirectory (str): The path to the desired output directory.
    Optional Args:
        dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'.
        labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None.
        scalingMethod (str, optional): The normalization and rescaling method. Defaults to 'z_score'.
            - 'min_max': Min-max scaling method.
            - 'z_score': Z-score scaling method.
        doApplyGaussianFilter (bool, optional): Specifies if a gaussian filter should be applied to grids. This is essentially a blurring/averaging filter that can help
            remove irregularities/artifacts in data. Defaults to True.
        sigma (float, optional): The sigma value used for the gaussian filter. Defaults to 0.5.
        evenlyDistributeDf (bool, optional): When true, will remove some rows from the dataframe to create an even distribution of labels. Defaults to False.
        subset (int, optional): When a number is specified, rows will be removed randomly to produce a dataframe of this size.
    Returns:
        tuple: A tuple containing:
            - dataArray (np.ndarray): The processed data array.
            - labelArray (np.ndarray): The processed label array.
            - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.
        All three are None when every row was removed during preprocessing.
    Raises:
        ValueError: If no label lookup exists at ``labelLookupPath`` and the user declines to create one.
    """
    # '{{}}' survives the first format() as '{}', leaving a template that is filled with 'data' / 'labels' below.
    savebaseName = os.path.join(saveDirectory, '{}_{{}}.npy'.format(label))

    dataArrayPath = savebaseName.format('data')
    labelArrayPath = savebaseName.format('labels')

    dfOutPath = os.path.join(saveDirectory, f'{label}_df.shp')

    overwrite = True

    # Only ask before overwriting when the complete set of output files is already present.
    if os.path.exists(dataArrayPath) and os.path.exists(labelArrayPath) and os.path.exists(dfOutPath):
        qStr = "Existing files for data and label arrays detected. Proceed with preprocessing and overwrite existing files (y/n)?"
        overwrite = lt._request_yn_input(qStr)

    if overwrite:
        print(f'Importing, filtering, and joining into one dataframe...')
        compiledDf = import_filter_concat_df(path, dataColumn, labelColumn, labelsToRemove=labelsToRemove)

        # Connect to the dictionary with labels and int vals, or offer to make one
        if not os.path.exists(labelLookupPath):
            qstr = "No label lookup exists at this filepath. Would you like to make one (y/n)."
            makeLookup = lt._request_yn_input(qstr)
            if makeLookup:
                make_lookup(labelLookupPath, compiledDf, labelColumn)
            else:
                raise ValueError('No label lookup exists at filepath. Please make a lookup! :~)')
        with open(labelLookupPath, 'r') as f:
            labelDict = json.load(f)

        # Make a column for int labels: invert the lookup (label -> key) and map every label through it.
        # The keys are whatever json.load produced; a label missing from the lookup raises a KeyError here.
        labelDictRev = {labelDict[key]:key for key in labelDict}
        compiledDf[Y_COL_LABEL] = compiledDf[labelColumn].apply(lambda val: labelDictRev[val])

        # Subset before loading so that only the rasters that are needed get read.
        if subset:
            print(f"Subsetting the dataframe to subset size: {subset}")
            dfToLoad = subset_df(compiledDf, subset, evenlyDistributeDf=evenlyDistributeDf, doPrint=False)
        else:
            dfToLoad = compiledDf

        # Load rasters
        print('Loading rasters...')
        dfLoadedRasters = load_rasters(dfToLoad, dataColumn, roiWidth, dataType= dataType)

        print('Removing entries with Nan vals...')
        # Drop rows whose loaded grid is invalid
        dfCleaned = rm_invalid_raster_rows(dfLoadedRasters, roiWidth)

        # Option to remove some labels to make an even distribution
        if evenlyDistributeDf:

            distributedDf = make_even_distribution(dfCleaned)
        else:
            distributedDf = dfCleaned

        # NOTE(review): subset_df is applied a second time here, on the cleaned frame, with the same
        # subset size as above - confirm this second pass is intended.
        if subset:
            subsetDf = subset_df(distributedDf, subset)
        else:
            subsetDf = distributedDf

        # Grid processing is skipped only when neither scaling nor filtering is requested.
        if not scalingMethod and not doApplyGaussianFilter:
            processedDf = subsetDf
        else:
            print(f'Processing grids with:\nscaling method: {scalingMethod}\ngaussian filter: {doApplyGaussianFilter}, sigma: {sigma}')
            processedDf = process_grids(subsetDf, scalingMethod=scalingMethod, doApplyGaussianFilter=doApplyGaussianFilter, sigma=sigma)

        # Shuffles Df
        shuffledDf = shuffle_df(processedDf)

        print('Converting data and labels to arrays...')
        dataArray, labelArray = make_arrays(shuffledDf, dataType = dataType)

        if dataArray is not None and labelArray is not None:
            # Save Data and Label Arrays
            np.save(dataArrayPath, dataArray)
            np.save(labelArrayPath, labelArray)

            # The loaded grids are dropped from the frame before it is written out.
            outDf = shuffledDf.drop(columns=[X_COL_LABEL])

            outDf.to_file(dfOutPath)

            # Print Paths
            print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')
        else:
            print("All df rows removed during preprocessing due to invalid raster data. No arrays or df's saved.")
            return None, None, None

    else:
        # Load the arrays and df from the existing files
        print(f'Loading from preexisting files.\nData Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDf Path: {dfOutPath}')
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects - only safe for trusted files.
        dataArray = np.load(dataArrayPath, allow_pickle=True)
        labelArray = np.load(labelArrayPath, allow_pickle=True)

        outDf = gpd.read_file(dfOutPath)

    return dataArray, labelArray, outDf
Preprocesses a GeoDataFrame by importing, filtering, joining, and transforming data columns and labels for training a convolutional neural network.
Required Args:
path (str or list): File path to a shapefile, directory of shapefiles, or a list of file paths to shapefiles. All shapefiles must contain a dataColumn and a labelColumn. dataColumn (str): A string matching the column name for the column containing filepaths to dem grids or images. roiWidth (int): The width of the raster grids. Any rasters with dimensions that are not roiWidth x roiWidth will be removed. labelColumn (str): A string matching the column name for the label column containing labels associated with your data column. labelLookupPath (str): The path to a lookup dictionary that maps integer values to string labels. Alternatively this function can make a lookup at this path if one does not currently exist. label (str): The label used to uniquely identify the output files. saveDirectory (str): The path to the desired output directory.
Optional Args:
dataType (str, optional): String describing the type of data in the data column. Options: 'dem', 'image1' or 'image3'. Defaults to 'dem'. labelsToRemove (str or list): Label or list of labels to be removed from the GeoDataFrame. Defaults to None. scalingMethod (str, optional): The normalization and rescaling method.
- 'min_max': Min-max scaling method. Will result in a grid with values ranging from 0 to 1.
- 'z_score': Z-score scaling method. Will transform grids into a standard normal distribution with a mean of 0 and a standard deviation of 1 (Default: 'z_score') doApplyGaussianFilter (bool, optional): Specifies if a Gaussian filter should be applied to grids. This is essentially a blurring/averaging filter that can help remove irregularities/artifacts in data. Defaults to True. sigma (float, optional): The sigma value used for the Gaussian filter. Defaults to 0.5. evenlyDistributeDf (bool, optional): When true, will remove some rows from the dataframe to create an even distribution of labels. Defaults to False. subset (int, optional): When a number is specified, rows will be removed randomly to produce a dataframe of this size.
Returns:
tuple: A tuple containing: - dataArray (np.ndarray): The processed data array. - labelArray (np.ndarray): The processed label array. - outDf (gpd.GeoDataFrame): The processed GeoDataFrame with the original raster paths, labels and geometry with an additional column containing integer mappings.