Commit a91a367a authored by Noric Couderc's avatar Noric Couderc
Browse files

Added function for loading training data in model training script

This function loads the training data and returns it nicely prepared,
so that it is ready to be used by the model training script.
parent 59a9ee84
Loading
Loading
Loading
Loading
+49 −14
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@ import pandas as pd
import numpy
import pickle

import os.path # Checking existence of paths

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
@@ -255,23 +257,35 @@ def normalize_data(data, selected_columns, normalizing_column):
#%%


if __name__ == "__main__":
    # The script takes a number of files as input
    # Loads the JMH data, SW & HW performance counters
    # Merges them to get a well built data file
    # And train the classifier.

    #%% 

    print("Loading training data...")
    jmh_with_best = load_jmh_data("data/jmh-results-34f6dc24.csv")
def load_training_data(jmh_results_filename,
                       software_counters_filename,
                       hardware_counters_filename):
    """Load and merge the data used for training JBrainy.

    Args:
        jmh_results_filename: Name of the file containing the JMH
            benchmark data.
        software_counters_filename: Name of the software counters file.
        hardware_counters_filename: Name of the hardware counters file.

    Returns:
        A report (dict) containing the data and some metadata: the
        selected software (SW) and hardware (HW) columns and the names of
        the data structures we will be dealing with.

    Raises:
        FileNotFoundError: If one of the provided paths is not an
            existing file.
    """
    # Precondition: all the provided files must exist. They are checked up
    # front so that a bad path fails before any loading or merging starts.
    for path in (jmh_results_filename,
                 software_counters_filename,
                 hardware_counters_filename):
        if not os.path.isfile(path):
            # A real exception class is required here: raising a plain
            # string is itself a TypeError in Python 3, which would hide
            # the offending path from the user.
            raise FileNotFoundError(
                "The path '{0}' does not exist".format(path))

    # Ok here we go
    jmh_with_best = load_jmh_data(jmh_results_filename)
    jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)
    software_data = load_software_counters(software_counters_filename)
    software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
    software_selected_columns = software_data.columns
    software_features = software_with_jmh.get(software_selected_columns)
    data_structure_names=software_with_jmh["Param: datastructureName_best"].sort_values().unique()
    papi_data = load_hardware_counters(hardware_counters_filename)
    papi_data['size'] = jmh_with_best['Param: applicationSize']
    hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
    hardware_features = papi_data.get(hardware_selected_columns)
@@ -280,10 +294,31 @@ if __name__ == "__main__":
    # apps where they were the best choice and apps where they were used.
    software_plus_hardware = software_with_jmh.join(hardware_features)
    sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)
    return { "data" : sw_hw_cleaned,
             "data_structure_names" : software_plus_hardware["Param: datastructureName_best"].sort_values().unique(),
             "software_selected_columns": software_selected_columns,
             "hardware_selected_columns": hardware_selected_columns }

if __name__ == "__main__":
    # The script takes a number of files as input
    # Loads the JMH data, SW & HW performance counters
    # Merges them to get a well built data file
    # And train the classifier.

    #%% 

    print("Loading training data...")
    training_data = load_training_data("data/jmh-results-9307f70f.csv",
                                       "software-perf.csv",
                                       "data/hardware-perf-data.csv")
    print("Training data loaded")

    #%%

    sw_hw_cleaned = training_data["data"]
    hardware_selected_columns = training_data["hardware_selected_columns"]
    software_selected_columns = training_data["software_selected_columns"]

    print("Normalizing data")
    hw_normalized_data = normalize_data(sw_hw_cleaned,
                                        hardware_selected_columns,