lidar_labeler.labeler_tools

This module provides a collection of secondary functions to assist with various tasks related to training and deploying a cnn on a labeled database of DEMs.

  1""" 
  2This module provides a collection of secondary functions to assist with various tasks related to training and 
  3deploying a cnn on a labeled database of DEMs.
  4"""
  5import sys
  6import os
  7# scriptDir = os.path.dirname(os.path.abspath(__file__))
  8# parentDir = os.path.dirname(scriptDir)
  9# sys.path.append(parentDir)
 10# from lidar_labeler import preprocessing as pp
 11from lidar_labeler import preprocessing as pp
 12
 13import json
 14import numpy as np
 15import pandas as pd
 16import matplotlib.pyplot as plt
 17import random
 18from sklearn import metrics as skmetrics
 19import geopandas as gpd
 20from sklearn.metrics import RocCurveDisplay
 21from pathlib import Path
 22
 23# global_vars = os.path.join(parentDir, 'configs', 'global_variables.json')
 24
 25# with open(global_vars, 'r') as f:
 26#     params_dict = json.load(f)
 27with (Path(__file__).resolve().parent.parent / 'configs' / 'global_variables.json').open('r') as f:
 28    params_dict = json.load(f)
 29
 30Y_COL_LABEL = params_dict['Y_COL_LABEL']
 31X_COL_LABEL = params_dict['X_COL_LABEL']
 32
 33def _request_yn_input(question: str):
 34    '''This is a helper function to get a 'Y'/'N' (e.g., yes/no, true/false)
 35    response to a question from the user via the command prompt. 
 36    
 37    It will continue to query the user untill a valid respons is issued.
 38
 39    Parameters
 40    ----------
 41    question : str
 42        The question that will be asked of the user.
 43
 44    Returns
 45    -------
 46    binaryResponse : TYPE
 47        True false based on if the user respond 'Y' (True) or 'N' (False).
 48
 49    '''
 50    response = str(input(question + '? (Y/N): ')).lower().strip()
 51
 52    if response[:1] == 'y':
 53        binaryResponse = True
 54    elif response[:1] == 'n':
 55        binaryResponse = False
 56    else:
 57        print('Whoopsy, please enter Y or N')
 58        binaryResponse = _request_yn_input()
 59
 60    return binaryResponse
 61
 62def load_json_params_preprocessing(filePath: str, print_info: bool=True):
 63    """Load parameters from a JSON file in a custom format.
 64
 65    Args:
 66        filePath (str): Path to the JSON file containing parameters.
 67        print_info (bool, optional): If True, print information about the parameters. Defaults to True.
 68
 69    Returns:
 70        dict: A dictionary containing the parameters.
 71    """
 72
 73    with open(filePath, 'r') as f:
 74        params_dict = json.load(f)
 75
 76    path = params_dict.get('path', None)
 77    dataColumn = params_dict.get('dataColumn', None)
 78    labelColumn = params_dict.get('labelColumn', None)
 79    labelsToRemove = params_dict.get('labelsToRemove', None)
 80    labelLookupPath = params_dict.get('labelLookupPath', None)
 81    label = params_dict.get('label', None)
 82    dataType = params_dict.get('dataType', None)
 83    scalingMethod = params_dict.get('scalingMethod', None)
 84    saveDirectory = params_dict.get('saveDirectory', None)
 85    doApplyGaussianFilter = params_dict.get('doApplyGaussianFilter', None)
 86    sigma = params_dict.get('sigma', None)
 87    evenlyDistributeDf = params_dict.get('evenlyDistributeDf', None)
 88    subset = params_dict.get('subset', None)
 89
 90
 91    if print_info:
 92        # Print the parameters and their types
 93        print("Path ({}): {}".format(type(path), path))
 94        print("Data Column ({}): {}".format(type(dataColumn), dataColumn))
 95        print("Label Column ({}): {}".format(type(labelColumn), labelColumn))
 96        print("Labels to Remove ({}): {}".format(type(labelsToRemove), labelsToRemove))
 97        print("Label Lookup Path ({}): {}".format(type(labelLookupPath), labelLookupPath))
 98        print("Label ({}): {}".format(type(label), label))
 99        print("Data Type ({}): {}".format(type(dataType), dataType))
100        print("Scaling Method ({}): {}".format(type(scalingMethod), scalingMethod))
101        print("Save Directory ({}): {}".format(type(saveDirectory), saveDirectory))
102        print("Do Apply Gaussian Fiter ({}): {}".format(type(doApplyGaussianFilter), doApplyGaussianFilter))
103        print("Sigma ({}): {}".format(type(sigma), sigma))
104        print("Evenly Distribute DataFrame ({}): {}".format(type(evenlyDistributeDf), evenlyDistributeDf))
105        print("Subset Df ({}): {}".format(type(subset), subset))
106
107    return params_dict
108
def check_array_for_abnormal_values(arr:np.ndarray):
    """
    Check an array for NaN, infinite values, zeros, and shape consistency.

    Args:
        arr (np.ndarray): Numeric array to be checked. Empty and 0-dimensional
            arrays are accepted.

    Returns:
        dict: Dictionary with the keys "NaN count", "Infinite count" and "Zero count"
            (element counts) and "Shape consistent" (bool).

    """
    nanCount = np.sum(np.isnan(arr))
    infCount = np.sum(np.isinf(arr))
    zeroCount = np.sum(arr == 0)

    # Check for consistency in shape: the first entry along axis 0 must have
    # the shape of every entry. With no entries to compare (empty first axis
    # or a 0-d array) indexing arr[0] would raise, so report True instead.
    if arr.ndim == 0 or arr.shape[0] == 0:
        shapeConsistent = True
    else:
        shapeConsistent = arr.shape[1:] == arr[0].shape

    return {
        "NaN count": nanCount,
        "Infinite count": infCount,
        "Zero count": zeroCount,
        "Shape consistent": shapeConsistent
    }
137
def arrays_to_dataframe(dataArray, labelArray):
    """Pack a data array and its label array into a two-column DataFrame.

    Args:
        dataArray (np.ndarray): Array containing data; each entry along the first axis becomes one row.
        labelArray (np.ndarray): Array containing labels; the first element of each entry is used as the row's label.

    Returns:
        pd.DataFrame: DataFrame with the data in column X_COL_LABEL and the labels in column Y_COL_LABEL.
    """
    samples = list(dataArray)
    labels = [entry[0] for entry in labelArray]

    frame = pd.DataFrame({X_COL_LABEL: samples, Y_COL_LABEL: labels})
    return frame
152
def subset_arrays(dataArray: np.ndarray, labelArray: np.ndarray, subsetSize:int):
    """Reduce paired data and label arrays to a subset, e.g. for quick experiments during model development.

    The arrays are combined into a DataFrame, subset with ``pp.subset_df`` and converted
    back to arrays with ``pp.make_arrays``.

    Args:
        dataArray (np.ndarray): Array containing data.
        labelArray (np.ndarray): Array containing labels.
        subsetSize (int): Size of the subset, passed on to ``pp.subset_df``.

    Returns:
        tuple: Subset of data and label arrays.
    """
    fullDf = arrays_to_dataframe(dataArray, labelArray)
    reducedDf = pp.subset_df(fullDf, subsetSize)

    subsetDataArray, subsetLabelArray = pp.make_arrays(
        reducedDf,
        dataCol=X_COL_LABEL,
        labelCol=Y_COL_LABEL,
    )
    return subsetDataArray, subsetLabelArray
171
def show_performance_curve(training_result, metrics:list, savePath:str=None, label:str=None):
    """Plot performance curves for training and validation metrics.

    One subplot is drawn per metric, showing ``history[metric]`` and
    ``history['val_' + metric]`` against the epoch index.

    Args:
        training_result (keras.callbacks.History): The history object returned by model.fit, containing training and validation metrics.
        metrics (list): List of metrics to plot. A single-element list is supported.
        savePath (str, optional): Directory in which to save the plot as '<label>_performance_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    n = len(metrics)
    # squeeze=False always returns a 2-D (1, n) array of Axes; without it a
    # single metric yields a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    for ax, metric in zip(axs[0], metrics):
        train_perf = training_result.history[str(metric)]
        validation_perf = training_result.history['val_' + str(metric)]

        ax.plot(train_perf, label=metric)
        ax.plot(validation_perf, label='val_' + str(metric))
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Score')
        ax.legend()

    # Save before showing so the written file contains the drawn figure.
    if savePath:
        path = os.path.join(savePath, f'{label}_performance_curve.png')
        print(f'Saving performance curve to: {path}')
        plt.savefig(path)
    plt.show()
198    
199
def show_performance_curve_history_dict(history_data:dict, metrics:list, savePath:str=None, label:str=None):
    """Plot performance curves for training and validation metrics from a history dictionary.

    One subplot is drawn per metric, showing ``history_data[metric]`` and
    ``history_data['val_' + metric]`` against the epoch index.

    Args:
        history_data (dict): Dictionary containing performance history, keyed by metric name.
        metrics (list): List of metrics to plot. A single-element list is supported.
        savePath (str, optional): Directory in which to save the plot as '<label>_performance_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    n = len(metrics)
    # squeeze=False always returns a 2-D (1, n) array of Axes; without it a
    # single metric yields a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    for ax, metric in zip(axs[0], metrics):
        train_perf = history_data[str(metric)]
        validation_perf = history_data['val_' + str(metric)]

        ax.plot(train_perf, label=metric)
        ax.plot(validation_perf, label='val_' + str(metric))
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Score')
        ax.legend()

    # Save before showing so the written file contains the drawn figure.
    if savePath:
        path = os.path.join(savePath, f'{label}_performance_curve.png')
        print(f'Saving performance curve to: {path}')
        plt.savefig(path)
    plt.show()
226
def show_roc_curve(trueLabels:np.ndarray, predLabels:np.ndarray, savePath:str=None, label:str=None):
    """Plot ROC curve.

    Args:
        trueLabels (np.ndarray): Array of true labels.
        predLabels (np.ndarray): Array of predictions, passed as ``y_pred`` to ``RocCurveDisplay.from_predictions``.
        savePath (str, optional): Directory in which to save the plot as '<label>_roc_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    RocCurveDisplay.from_predictions(trueLabels, predLabels)

    # Save BEFORE showing: once plt.show() returns the figure may already be
    # closed, in which case savefig would write an empty image.
    if savePath:
        path = os.path.join(savePath, f'{label}_roc_curve.png')
        print(f'Saving roc curve to: {path}')
        plt.savefig(path)

    plt.show()
243
244    
def load_label_lookup(pathToLookup:str):
    """Read the label lookup stored in a JSON file.

    Args:
        pathToLookup (str): Path to the JSON file containing label lookup.

    Returns:
        dict: The parsed content of the file.
    """
    with open(pathToLookup, 'r') as lookupFile:
        lookup = json.load(lookupFile)
    return lookup
256
257    
def show_confusion_matrix(yTest:np.ndarray, pred:np.ndarray, lookupDict:dict= None, savePath:str = None, label:str = None, normalize:str = None):
    """Compute and plot the confusion matrix of predictions against true labels.

    Args:
        yTest (np.ndarray): True labels; flattened before use.
        pred (np.ndarray): Predicted labels; flattened before use.
        lookupDict (dict, optional): Maps label indices (as strings) to display names. The names are
            shown in ascending order of the integer value of the keys. Defaults to None.
        savePath (str, optional): Directory in which to save the plot. Defaults to None.
        label (str, optional): Prefix for the saved file name. The plot is only saved when both
            savePath and label are given. Defaults to None.
        normalize (str, optional): Type of normalization for the confusion matrix.
            - None: will not normalize
            - 'true': normalizes over true population (rows)
            - 'pred': normalizes over predicted population (columns)
            - 'all': normalizes to the whole population
            Defaults to None.

    Returns:
        None
    """
    confusion = skmetrics.confusion_matrix(yTest.flatten(), pred.flatten(), normalize=normalize)

    if lookupDict:
        orderedKeys = sorted(int(key) for key in lookupDict.keys())
        classNames = [lookupDict[str(key)] for key in orderedKeys]
        display = skmetrics.ConfusionMatrixDisplay(confusion, display_labels=classNames)
    else:
        display = skmetrics.ConfusionMatrixDisplay(confusion)
    display.plot()

    if not (savePath and label):
        return None

    path = os.path.join(savePath, f'{label}_confusion_matrix.png')
    print(f'Saving confusion matirix to: {path}')
    plt.savefig(path)
    return None
290
def display_rand_arrays(dataArray, labelArray, n, pathToLookup=None):
    """Show n randomly chosen arrays, one figure each, titled with their labels.

    Useful when training a model to make sure the preprocessing step achieved the desired result.

    Args:
        dataArray (array-like): Array containing data.
        labelArray (array-like): Array containing labels; the first element of each entry is the label.
        n (int): Number of arrays to display (sampled without replacement).
        pathToLookup (str, optional): Path to the JSON file containing label lookup. When given, the
            label is translated through this lookup using its string form as key. Defaults to None.

    Returns:
        None
    """
    chosenIndices = random.sample(range(len(dataArray)), n)

    if pathToLookup:
        # Load the label lookup dictionary from the JSON file
        with open(pathToLookup, 'r') as lookupFile:
            labelLookup = json.load(lookupFile)

    for idx in chosenIndices:
        plt.imshow(dataArray[idx], cmap='gray')

        rawLabel = labelArray[idx][0]
        # Lookup keys are strings, so convert the raw label before indexing.
        label = labelLookup[str(rawLabel)] if pathToLookup else rawLabel
        plt.title(f'Label: {label}')

        plt.colorbar()  # Add a colorbar for reference
        plt.show()  # One figure per sampled array
    return None
326
def get_arrays_from_json_params(params:str, loadDf:bool = True):
    """Load data and label arrays from a JSON parameter file.

    The parameter file must contain the keys 'saveDirectory' and 'label'. The actual
    loading is delegated to ``get_arrays_from_file`` so both loaders stay in sync.

    Args:
        params (str): Path to the JSON parameter file.
        loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.

    Returns:
        tuple: A tuple containing data array, label array, number of rows, number of columns, and GeoDataFrame (None if not loaded).

    Raises:
        KeyError: If 'saveDirectory' or 'label' is missing from the parameter file.
    """
    with open(params, 'r') as f:
        params_dict = json.load(f)
    saveDirectory = params_dict['saveDirectory']
    label = params_dict['label']

    # Same files, same console output and same return value as before, but
    # implemented once in get_arrays_from_file.
    return get_arrays_from_file(label, saveDirectory, loadDf=loadDf)
357
def get_arrays_from_file(label:str, saveDir:str, loadDf:bool=True):
    """Load the saved data and label arrays for a given label from a directory.

    Reads '<label>_data.npy' and '<label>_labels.npy' from saveDir, prints both array shapes
    and the number of NaN values in the data array, and optionally reads '<label>_df.shp'.

    Args:
        label (str): Label for the arrays (file name prefix).
        saveDir (str): Directory where the arrays are saved.
        loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.

    Returns:
        tuple: A tuple containing data array, label array, number of rows, number of columns, and GeoDataFrame (None if not loaded).
    """
    arrayPathTemplate = os.path.join(saveDir, f'{label}_{{}}.npy')
    X = np.load(arrayPathTemplate.format('data'), allow_pickle=True)
    y = np.load(arrayPathTemplate.format('labels'), allow_pickle=True)

    # Rows and columns are taken from axes 1 and 2 of the data array.
    nrows, ncols = X.shape[1], X.shape[2]

    print(X.shape,y.shape)
    print('NaN Value Count in X:', np.sum(np.isnan(X))) #Check for NaN Vals

    df = None
    if loadDf:
        df = gpd.read_file(os.path.join(saveDir, f'{label}_df.shp'))
    return X, y, nrows, ncols, df
384
385
def subset_processed_df(saveDirectory:str, label:str, subset:int, newLabel:str=None):
    """Subset previously processed data and save the subset next to the original files.

    The arrays and GeoDataFrame saved under ``label`` are loaded, re-joined, subset with
    ``pp.subset_df`` and written back as .npy arrays plus a shapefile.

    Args:
        saveDirectory (str): Directory where the processed data is saved.
        label (str): Label for the data.
        subset (int): Size of the subset, passed on to ``pp.subset_df``.
        newLabel (str, optional): File name prefix for the subset. When omitted, '<label>_subset' is used. Defaults to None.

    Returns:
        tuple: A tuple containing data array, label array, and subset DataFrame.
    """
    #load df, label, and data arrays
    X, y, _, _, df = get_arrays_from_file(label, saveDirectory)

    #Put data and label arrays back into df
    df[X_COL_LABEL] = list(X)
    df[Y_COL_LABEL] = [lab[0] for lab in y]
    subsetDf = pp.subset_df(df, subset)

    # Name the columns explicitly (as subset_arrays does) so the arrays are
    # read from the columns written above rather than from make_arrays' defaults.
    dataArray, labelArray = pp.make_arrays(subsetDf, dataCol = X_COL_LABEL, labelCol = Y_COL_LABEL)

    if newLabel:
        savebaseName = os.path.join(saveDirectory, '{}_{{}}.npy'.format(newLabel))
        dfOutPath = os.path.join(saveDirectory, f'{newLabel}_df.shp')
    else:
        savebaseName = os.path.join(saveDirectory, '{}_subset_{{}}.npy'.format(label))
        dfOutPath = os.path.join(saveDirectory, f'{label}_subset_df.shp')

    #Format basename for data and label arrays
    dataArrayPath = savebaseName.format('data')
    labelArrayPath = savebaseName.format('labels')

    #Save Data and Label Arrays
    np.save(dataArrayPath, dataArray)
    np.save(labelArrayPath, labelArray)

    #Save df without the data column; the raster data itself is stored in the .npy file
    outDf = subsetDf.drop(columns = [X_COL_LABEL])
    outDf.to_file(dfOutPath)
    print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')

    return dataArray, labelArray, subsetDf
Y_COL_LABEL = 'intLabel'
X_COL_LABEL = 'rasterData'
def load_json_params_preprocessing(filePath: str, print_info: bool = True):
 63def load_json_params_preprocessing(filePath: str, print_info: bool=True):
 64    """Load parameters from a JSON file in a custom format.
 65
 66    Args:
 67        filePath (str): Path to the JSON file containing parameters.
 68        print_info (bool, optional): If True, print information about the parameters. Defaults to True.
 69
 70    Returns:
 71        dict: A dictionary containing the parameters.
 72    """
 73
 74    with open(filePath, 'r') as f:
 75        params_dict = json.load(f)
 76
 77    path = params_dict.get('path', None)
 78    dataColumn = params_dict.get('dataColumn', None)
 79    labelColumn = params_dict.get('labelColumn', None)
 80    labelsToRemove = params_dict.get('labelsToRemove', None)
 81    labelLookupPath = params_dict.get('labelLookupPath', None)
 82    label = params_dict.get('label', None)
 83    dataType = params_dict.get('dataType', None)
 84    scalingMethod = params_dict.get('scalingMethod', None)
 85    saveDirectory = params_dict.get('saveDirectory', None)
 86    doApplyGaussianFilter = params_dict.get('doApplyGaussianFilter', None)
 87    sigma = params_dict.get('sigma', None)
 88    evenlyDistributeDf = params_dict.get('evenlyDistributeDf', None)
 89    subset = params_dict.get('subset', None)
 90
 91
 92    if print_info:
 93        # Print the parameters and their types
 94        print("Path ({}): {}".format(type(path), path))
 95        print("Data Column ({}): {}".format(type(dataColumn), dataColumn))
 96        print("Label Column ({}): {}".format(type(labelColumn), labelColumn))
 97        print("Labels to Remove ({}): {}".format(type(labelsToRemove), labelsToRemove))
 98        print("Label Lookup Path ({}): {}".format(type(labelLookupPath), labelLookupPath))
 99        print("Label ({}): {}".format(type(label), label))
100        print("Data Type ({}): {}".format(type(dataType), dataType))
101        print("Scaling Method ({}): {}".format(type(scalingMethod), scalingMethod))
102        print("Save Directory ({}): {}".format(type(saveDirectory), saveDirectory))
103        print("Do Apply Gaussian Fiter ({}): {}".format(type(doApplyGaussianFilter), doApplyGaussianFilter))
104        print("Sigma ({}): {}".format(type(sigma), sigma))
105        print("Evenly Distribute DataFrame ({}): {}".format(type(evenlyDistributeDf), evenlyDistributeDf))
106        print("Subset Df ({}): {}".format(type(subset), subset))
107
108    return params_dict

Load parameters from a JSON file in a custom format.

Arguments:
  • filePath (str): Path to the JSON file containing parameters.
  • print_info (bool, optional): If True, print information about the parameters. Defaults to True.
Returns:

dict: A dictionary containing the parameters.

def check_array_for_abnormal_values(arr: numpy.ndarray):
def check_array_for_abnormal_values(arr:np.ndarray):
    """
    Check an array for NaN, infinite values, zeros, and shape consistency.

    Args:
        arr (np.ndarray): Numeric array to be checked. Empty and 0-dimensional
            arrays are accepted.

    Returns:
        dict: Dictionary with the keys "NaN count", "Infinite count" and "Zero count"
            (element counts) and "Shape consistent" (bool).

    """
    nanCount = np.sum(np.isnan(arr))
    infCount = np.sum(np.isinf(arr))
    zeroCount = np.sum(arr == 0)

    # Check for consistency in shape: the first entry along axis 0 must have
    # the shape of every entry. With no entries to compare (empty first axis
    # or a 0-d array) indexing arr[0] would raise, so report True instead.
    if arr.ndim == 0 or arr.shape[0] == 0:
        shapeConsistent = True
    else:
        shapeConsistent = arr.shape[1:] == arr[0].shape

    return {
        "NaN count": nanCount,
        "Infinite count": infCount,
        "Zero count": zeroCount,
        "Shape consistent": shapeConsistent
    }

Check an array for NaN, infinite values, zeros, and shape consistency.

Arguments:
  • arr (np.ndarray): Array to be checked.
Returns:

dict: Dictionary containing counts of NaN, infinite, and zero values, and shape consistency.

def arrays_to_dataframe(dataArray, labelArray):
139def arrays_to_dataframe(dataArray, labelArray):
140    """Convert data and label arrays to a DataFrame.
141
142    Args:
143        dataArray (np.ndarray): Array containing data.
144        labelArray (np.ndarray): Array containing labels.
145
146    Returns:
147        pd.DataFrame: DataFrame with data and labels.
148    """
149    dataArrayList = [dataArray[i] for i in range(dataArray.shape[0])]
150    labelList = [labelArray[i][0] for i in range(labelArray.shape[0])]
151
152    return pd.DataFrame({X_COL_LABEL: dataArrayList, Y_COL_LABEL: labelList})

Convert data and label arrays to a DataFrame.

Arguments:
  • dataArray (np.ndarray): Array containing data.
  • labelArray (np.ndarray): Array containing labels.
Returns:

pd.DataFrame: DataFrame with data and labels.

def subset_arrays(dataArray: numpy.ndarray, labelArray: numpy.ndarray, subsetSize: int):
154def subset_arrays(dataArray: np.ndarray, labelArray: np.ndarray, subsetSize:int):
155    """Create a subset of data and label arrays. Can be used during model development to test model performance on a random subset of data.
156
157    Args:
158        dataArray (np.ndarray): Array containing data.
159        labelArray (np.ndarray): Array containing labels.
160        subsetSize (int):  Size of the subset.
161
162    Returns:
163        tuple: Subset of data and label arrays.
164    """
165    arrayDf = arrays_to_dataframe(dataArray, labelArray)
166
167    subsetDf = pp.subset_df(arrayDf, subsetSize)
168
169    subsetDataArray, subsetLabelArray = pp.make_arrays(subsetDf, dataCol = X_COL_LABEL, labelCol = Y_COL_LABEL)
170
171    return subsetDataArray, subsetLabelArray

Create a subset of data and label arrays. Can be used during model development to test model performance on a random subset of data.

Arguments:
  • dataArray (np.ndarray): Array containing data.
  • labelArray (np.ndarray): Array containing labels.
  • subsetSize (int): Size of the subset.
Returns:

tuple: Subset of data and label arrays.

def show_performance_curve( training_result, metrics: list, savePath: str = None, label: str = None):
def show_performance_curve(training_result, metrics:list, savePath:str=None, label:str=None):
    """Plot performance curves for training and validation metrics.

    One subplot is drawn per metric, showing ``history[metric]`` and
    ``history['val_' + metric]`` against the epoch index.

    Args:
        training_result (keras.callbacks.History): The history object returned by model.fit, containing training and validation metrics.
        metrics (list): List of metrics to plot. A single-element list is supported.
        savePath (str, optional): Directory in which to save the plot as '<label>_performance_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    n = len(metrics)
    # squeeze=False always returns a 2-D (1, n) array of Axes; without it a
    # single metric yields a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    for ax, metric in zip(axs[0], metrics):
        train_perf = training_result.history[str(metric)]
        validation_perf = training_result.history['val_' + str(metric)]

        ax.plot(train_perf, label=metric)
        ax.plot(validation_perf, label='val_' + str(metric))
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Score')
        ax.legend()

    # Save before showing so the written file contains the drawn figure.
    if savePath:
        path = os.path.join(savePath, f'{label}_performance_curve.png')
        print(f'Saving performance curve to: {path}')
        plt.savefig(path)
    plt.show()

Plot performance curves for training and validation metrics.

Arguments:
  • training_result (keras.callbacks.History): The history object returned by model.fit, containing training and validation metrics.
  • metrics (list): List of metrics to plot.
  • savePath (str, optional): Path to save the performance curve plot. Defaults to None.
  • label (str, optional): Label for the plot. Defaults to None.
def show_performance_curve_history_dict( history_data: dict, metrics: list, savePath: str = None, label: str = None):
def show_performance_curve_history_dict(history_data:dict, metrics:list, savePath:str=None, label:str=None):
    """Plot performance curves for training and validation metrics from a history dictionary.

    One subplot is drawn per metric, showing ``history_data[metric]`` and
    ``history_data['val_' + metric]`` against the epoch index.

    Args:
        history_data (dict): Dictionary containing performance history, keyed by metric name.
        metrics (list): List of metrics to plot. A single-element list is supported.
        savePath (str, optional): Directory in which to save the plot as '<label>_performance_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    n = len(metrics)
    # squeeze=False always returns a 2-D (1, n) array of Axes; without it a
    # single metric yields a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    for ax, metric in zip(axs[0], metrics):
        train_perf = history_data[str(metric)]
        validation_perf = history_data['val_' + str(metric)]

        ax.plot(train_perf, label=metric)
        ax.plot(validation_perf, label='val_' + str(metric))
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Score')
        ax.legend()

    # Save before showing so the written file contains the drawn figure.
    if savePath:
        path = os.path.join(savePath, f'{label}_performance_curve.png')
        print(f'Saving performance curve to: {path}')
        plt.savefig(path)
    plt.show()

Plot performance curves for training and validation metrics from a history dictionary.

Arguments:
  • history_data (dict): Dictionary containing performance history.
  • metrics (list): List of metrics to plot.
  • savePath (str, optional): Path to save the performance curve plot. Defaults to None.
  • label (str, optional): Label for the plot. Defaults to None.
def show_roc_curve( trueLabels: numpy.ndarray, predLabels: numpy.ndarray, savePath: str = None, label: str = None):
def show_roc_curve(trueLabels:np.ndarray, predLabels:np.ndarray, savePath:str=None, label:str=None):
    """Plot ROC curve.

    Args:
        trueLabels (np.ndarray): Array of true labels.
        predLabels (np.ndarray): Array of predictions, passed as ``y_pred`` to ``RocCurveDisplay.from_predictions``.
        savePath (str, optional): Directory in which to save the plot as '<label>_roc_curve.png'. Defaults to None (not saved).
        label (str, optional): Prefix for the saved file name. Defaults to None.
    """
    RocCurveDisplay.from_predictions(trueLabels, predLabels)

    # Save BEFORE showing: once plt.show() returns the figure may already be
    # closed, in which case savefig would write an empty image.
    if savePath:
        path = os.path.join(savePath, f'{label}_roc_curve.png')
        print(f'Saving roc curve to: {path}')
        plt.savefig(path)

    plt.show()

Plot ROC curve.

Arguments:
  • trueLabels (np.ndarray): Array of true labels.
  • predLabels (np.ndarray): Array of predicted labels.
  • savePath (str, optional): Path to save the ROC curve plot. Defaults to None.
  • label (str, optional): Label for the plot. Defaults to None.
def load_label_lookup(pathToLookup: str):
246def load_label_lookup(pathToLookup:str):
247    """Load label lookup dictionary from a JSON file.
248
249    Args:
250        pathToLookup (str):  Path to the JSON file containing label lookup.
251
252    Returns:
253        dict: Label lookup dictionary.
254    """
255    with open(pathToLookup, 'r') as f:
256        return json.load(f)

Load label lookup dictionary from a JSON file.

Arguments:
  • pathToLookup (str): Path to the JSON file containing label lookup.
Returns:

dict: Label lookup dictionary.

def show_confusion_matrix( yTest: numpy.ndarray, pred: numpy.ndarray, lookupDict: dict = None, savePath: str = None, label: str = None, normalize: str = None):
259def show_confusion_matrix(yTest:np.ndarray, pred:np.ndarray, lookupDict:dict= None, savePath:str = None, label:str = None, normalize:str = None):
260    """Display a confusion matrix for evaluation.
261
262    Args:
263        yTest (array-like): True labels.
264        pred (array-like): Predicted labels.
265        lookupDict (dict, optional): Dictionary for mapping label indices to their actual labels. Defaults to None.
266        savePath (str, optional): Path to save the plot. Defaults to None.
267        label (str, optional): Label for the plot. Defaults to None.
268        normalize (str, optional): Type of normalization for the confusion matrix. 
269            - None: will not normalize
270            -'true': normalizes over true population (rows)
271            -'pred': normalizes over predicted population (columns)
272            -'all': normalizes to the whole population
273            Defaults to None.
274
275    Returns:
276        None
277    """
278    
279    cm = skmetrics.confusion_matrix(yTest.flatten(),pred.flatten(), normalize=normalize)
280    if lookupDict:
281        displayLabels = [lookupDict[str(label)] for label in sorted(map(int, lookupDict.keys()))]
282        disp = skmetrics.ConfusionMatrixDisplay(cm, display_labels=displayLabels)
283    else:
284        disp = skmetrics.ConfusionMatrixDisplay(cm)
285    disp.plot()
286    if savePath and label:
287        path = os.path.join(savePath, f'{label}_confusion_matrix.png')
288        print(f'Saving confusion matirix to: {path}')
289        plt.savefig(path)
290    return None

Display a confusion matrix for evaluation.

Arguments:
  • yTest (array-like): True labels.
  • pred (array-like): Predicted labels.
  • lookupDict (dict, optional): Dictionary for mapping label indices to their actual labels. Defaults to None.
  • savePath (str, optional): Path to save the plot. Defaults to None.
  • label (str, optional): Label for the plot. Defaults to None.
  • normalize (str, optional): Type of normalization for the confusion matrix.
    • None: will not normalize
    • 'true': normalizes over true population (rows)
    • 'pred': normalizes over predicted population (columns)
    • 'all': normalizes to the whole population
    Defaults to None.
Returns:

None

def display_rand_arrays(dataArray, labelArray, n, pathToLookup=None):
292def display_rand_arrays(dataArray, labelArray, n, pathToLookup=None):
293    """Display a random selection of arrays with corresponding labels. Useful when trainign a model to make sure preprocessing
294    step achieved the desired result. 
295
296    Args:
297        dataArray (array-like): Array containing data.
298        labelArray (array-like): Array containing labels.
299        n (int): Number of arrays to display.
300        pathToLookup (str, optional): Path to the JSON file containing label lookup. Defaults to None.
301
302    Returns:
303        None
304    """
305    i_vals = random.sample(range(len(dataArray)), n)
306    
307    if pathToLookup:
308    # Load the label lookup dictionary from the JSON file
309        with open(pathToLookup, 'r') as f:
310            labelLookup = json.load(f)
311        
312    for i in i_vals:
313        # Plot the grid
314        plt.imshow(dataArray[i], cmap='gray')
315        
316        # Get the corresponding label from the lookup dictionary
317        if pathToLookup:
318            label = labelLookup[str(labelArray[i][0])]  # Convert label to string if needed
319        else:
320            label = labelArray[i][0]
321        # Set the title for the plot
322        plt.title(f'Label: {label}')
323
324        plt.colorbar()  # Add a colorbar for reference
325        plt.show()  # Show the plot for each iteration
326    return None

Display a random selection of arrays with corresponding labels. Useful when training a model to make sure the preprocessing step achieved the desired result.

Arguments:
  • dataArray (array-like): Array containing data.
  • labelArray (array-like): Array containing labels.
  • n (int): Number of arrays to display.
  • pathToLookup (str, optional): Path to the JSON file containing label lookup. Defaults to None.
Returns:

None

def get_arrays_from_json_params(params: str, loadDf: bool = True):
def get_arrays_from_json_params(params:str, loadDf:bool = True):
    """Load data and label arrays described by a JSON parameter file.

    The JSON file must contain the keys ``saveDirectory`` and ``label``. Arrays are read from
    ``<saveDirectory>/<label>_data.npy`` and ``<saveDirectory>/<label>_labels.npy``; when
    ``loadDf`` is true the GeoDataFrame is read from ``<saveDirectory>/<label>_df.shp``.

    The shapes of both arrays and the NaN count of the data array are printed as a quick check.

    Args:
        params (str): Path to the JSON parameter file.
        loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.

    Returns:
        tuple: ``(X, y, nrows, ncols, df)`` where ``nrows``/``ncols`` are ``X.shape[1]`` and
        ``X.shape[2]``, and ``df`` is None when ``loadDf`` is False.

    Raises:
        KeyError: If ``saveDirectory`` or ``label`` is missing from the JSON file.
    """
    with open(params, 'r') as f:
        loadedParams = json.load(f)
    saveDirectory = loadedParams['saveDirectory']
    label = loadedParams['label']

    # Build each file name directly. Calling str.format on the joined path would treat any
    # '{' or '}' in the directory or label as a replacement field and fail or pick the wrong file.
    dataPath = os.path.join(saveDirectory, f'{label}_data.npy')
    labelsPath = os.path.join(saveDirectory, f'{label}_labels.npy')

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only point this
    # at array files from a trusted source.
    X = np.load(dataPath, allow_pickle=True)
    y = np.load(labelsPath, allow_pickle=True)
    nrows = X.shape[1]
    ncols = X.shape[2]
    print(X.shape,y.shape)
    print('NaN Value Count in X:', np.sum(np.isnan(X))) #Check for NaN Vals

    if loadDf:
        dfPath = os.path.join(saveDirectory, f'{label}_df.shp')
        df = gpd.read_file(dfPath)
    else:
        df = None
    return X, y, nrows, ncols, df

Load data and label arrays from a JSON parameter file.

Arguments:
  • params (str): Path to the JSON parameter file.
  • loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.
Returns:

tuple: A tuple containing data array, label array, number of rows, number of columns, and GeoDataFrame (if loaded).

def get_arrays_from_file(label: str, saveDir: str, loadDf: bool = True):
def get_arrays_from_file(label:str, saveDir:str, loadDf:bool=True):
    """Load data and label arrays from a directory.

    Arrays are read from ``<saveDir>/<label>_data.npy`` and ``<saveDir>/<label>_labels.npy``;
    when ``loadDf`` is true the GeoDataFrame is read from ``<saveDir>/<label>_df.shp``.

    The shapes of both arrays and the NaN count of the data array are printed as a quick check.

    Args:
        label (str): Label (file-name prefix) for the arrays.
        saveDir (str): Directory where the arrays are saved.
        loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.

    Returns:
        tuple: ``(X, y, nrows, ncols, df)`` where ``nrows``/``ncols`` are ``X.shape[1]`` and
        ``X.shape[2]``, and ``df`` is None when ``loadDf`` is False.
    """
    # Build each file name directly. Calling str.format on the joined path would treat any
    # '{' or '}' in the directory or label as a replacement field and fail or pick the wrong file.
    dataPath = os.path.join(saveDir, f'{label}_data.npy')
    labelsPath = os.path.join(saveDir, f'{label}_labels.npy')

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only point this
    # at array files from a trusted source.
    X = np.load(dataPath, allow_pickle=True)
    y = np.load(labelsPath, allow_pickle=True)
    nrows = X.shape[1]
    ncols = X.shape[2]

    print(X.shape,y.shape)
    print('NaN Value Count in X:', np.sum(np.isnan(X))) #Check for NaN Vals
    if loadDf:
        dfPath = os.path.join(saveDir, f'{label}_df.shp')
        df = gpd.read_file(dfPath)
    else:
        df = None
    return X, y, nrows, ncols, df

Load data and label arrays from a file.

Arguments:
  • label (str): Label for the arrays.
  • saveDir (str): Directory where the arrays are saved.
  • loadDf (bool, optional): Whether to load a GeoDataFrame. Defaults to True.
Returns:

tuple: A tuple containing data array, label array, number of rows, number of columns, and GeoDataFrame (if loaded).

def subset_processed_df(saveDirectory: str, label: str, subset: int, newLabel: str = None):
def subset_processed_df(saveDirectory:str, label:str, subset:int, newLabel:str=None):
    """Subset previously processed data and save the subset next to the original files.

    The arrays and GeoDataFrame stored under ``label`` are loaded, the arrays are re-attached to
    the GeoDataFrame as columns, and ``pp.subset_df`` / ``pp.make_arrays`` produce the subset and
    its arrays. Outputs are written to ``saveDirectory`` as ``<stem>_data.npy``,
    ``<stem>_labels.npy`` and ``<stem>_df.shp``, where ``<stem>`` is ``newLabel`` when given and
    ``<label>_subset`` otherwise. The three output paths are printed.

    Args:
        saveDirectory (str): Directory where the processed data is saved.
        label (str): Label for the data.
        subset (int): Size of the subset (passed through to ``pp.subset_df``).
        newLabel (str, optional): New label for the subset. Defaults to None.

    Returns:
        tuple: A tuple containing data array, label array, and subset DataFrame. The returned
        DataFrame still carries the data-array column; only the saved file omits it.
    """
    #load df, label, and data arrays
    X, y, _, _, df = get_arrays_from_file(label, saveDirectory)

    #Put data and label arrays back into df (one array / one scalar label per row)
    df[X_COL_LABEL] = list(X)
    df[Y_COL_LABEL] = [lab[0] for lab in y]
    subsetDf = pp.subset_df(df, subset)

    dataArray, labelArray = pp.make_arrays(subsetDf)

    # Build each output name directly. Calling str.format on the joined path would treat any
    # '{' or '}' in the directory or label as a replacement field and fail or write the wrong file.
    stem = newLabel if newLabel else f'{label}_subset'
    dataArrayPath = os.path.join(saveDirectory, f'{stem}_data.npy')
    labelArrayPath = os.path.join(saveDirectory, f'{stem}_labels.npy')
    dfOutPath = os.path.join(saveDirectory, f'{stem}_df.shp')

    #Save Data and Label Arrays
    np.save(dataArrayPath, dataArray)
    np.save(labelArrayPath, labelArray)

    #Save df without the array column (the arrays are already stored in the .npy file)
    outDf = subsetDf.drop(columns = [X_COL_LABEL])
    outDf.to_file(dfOutPath)
    print(f'Data Array Path: {dataArrayPath}\nLabel Array Path: {labelArrayPath}\nDataframe Path: {dfOutPath}')

    return dataArray, labelArray, subsetDf

Subset a processed DataFrame based on a specified label and subset.

Arguments:
  • saveDirectory (str): Directory where the processed data is saved.
  • label (str): Label for the data.
  • subset (int): Size of the subset.
  • newLabel (str, optional): New label for the subset. Defaults to None.
Returns:

tuple: A tuple containing data array, label array, and subset DataFrame.