def load_training_data(jmh_results_filename,
                       software_counters_filename,
                       hardware_counters_filename):
    """
    Load and merge the data used for training JBrainy.

    input:
        - jmh_results_filename: name of the file containing the JMH
          benchmark data
        - software_counters_filename: name of the software counters file
        - hardware_counters_filename: name of the hardware counters file

    returns:
        A dict ("report") with the following keys:
        - "data": the JMH + software + hardware frame after
          remove_weakmap_hashmap has been applied
        - "data_structure_names": sorted unique values of the
          "Param: datastructureName_best" column (taken before the
          remove_weakmap_hashmap step)
        - "software_selected_columns": the columns of the software
          counters data
        - "hardware_selected_columns": the hardware counter columns whose
          name contains "PAPI"

    raises:
        FileNotFoundError: if any of the three given paths is not an
        existing file.
    """
    # Imported locally so the function does not depend on the surrounding
    # module's import block.
    import os.path

    # Precondition: all the provided files must exist.
    # A real exception type is required here: raising a plain string is
    # itself a TypeError and would hide the actual problem from the caller.
    for path in (jmh_results_filename,
                 software_counters_filename,
                 hardware_counters_filename):
        if not os.path.isfile(path):
            raise FileNotFoundError(
                "The path '{0}' does not exist".format(path))

    # JMH benchmark results, with a per-sample weight attached.
    jmh_with_best = load_jmh_data(jmh_results_filename)
    jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)

    # Software counters, merged with the JMH results.
    software_data = load_software_counters(software_counters_filename)
    software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
    software_selected_columns = software_data.columns

    # Hardware counters: only the PAPI columns are kept as features.
    papi_data = load_hardware_counters(hardware_counters_filename)
    # NOTE(review): 'size' does not match the "PAPI" filter below, so it is
    # not part of hardware_features -- confirm whether it is still needed.
    papi_data['size'] = jmh_with_best['Param: applicationSize']
    hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
    hardware_features = papi_data.get(hardware_selected_columns)

    # NOTE(review): the diff this was reviewed from hides two lines just
    # before this point; the visible remainder of the original comment
    # reads "apps where they were the best choice and apps where they were
    # used." -- confirm against the full file that nothing else is missing.
    software_plus_hardware = software_with_jmh.join(hardware_features)
    sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)

    best_names = software_plus_hardware["Param: datastructureName_best"]
    return {
        "data": sw_hw_cleaned,
        "data_structure_names": best_names.sort_values().unique(),
        "software_selected_columns": software_selected_columns,
        "hardware_selected_columns": hardware_selected_columns,
    }
#%% print("Loading training data...") training_data = load_training_data("data/jmh-results-9307f70f.csv", "software-perf.csv", "data/hardware-perf-data.csv") print("Training data loaded") #%% sw_hw_cleaned = training_data["data"] hardware_selected_columns = training_data["hardware_selected_columns"] software_selected_columns = training_data["software_selected_columns"] print("Normalizing data") hw_normalized_data = normalize_data(sw_hw_cleaned, hardware_selected_columns, Loading Loading
def load_training_data(jmh_results_filename,
                       software_counters_filename,
                       hardware_counters_filename):
    """
    Load and merge the data used for training JBrainy.

    input:
        - jmh_results_filename: name of the file containing the JMH
          benchmark data
        - software_counters_filename: name of the software counters file
        - hardware_counters_filename: name of the hardware counters file

    returns:
        A dict ("report") with the following keys:
        - "data": the JMH + software + hardware frame after
          remove_weakmap_hashmap has been applied
        - "data_structure_names": sorted unique values of the
          "Param: datastructureName_best" column (taken before the
          remove_weakmap_hashmap step)
        - "software_selected_columns": the columns of the software
          counters data
        - "hardware_selected_columns": the hardware counter columns whose
          name contains "PAPI"

    raises:
        FileNotFoundError: if any of the three given paths is not an
        existing file.
    """
    # Imported locally so the function does not depend on the surrounding
    # module's import block.
    import os.path

    # Precondition: all the provided files must exist.
    # A real exception type is required here: raising a plain string is
    # itself a TypeError and would hide the actual problem from the caller.
    for path in (jmh_results_filename,
                 software_counters_filename,
                 hardware_counters_filename):
        if not os.path.isfile(path):
            raise FileNotFoundError(
                "The path '{0}' does not exist".format(path))

    # JMH benchmark results, with a per-sample weight attached.
    jmh_with_best = load_jmh_data(jmh_results_filename)
    jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)

    # Software counters, merged with the JMH results.
    software_data = load_software_counters(software_counters_filename)
    software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
    software_selected_columns = software_data.columns

    # Hardware counters: only the PAPI columns are kept as features.
    papi_data = load_hardware_counters(hardware_counters_filename)
    # NOTE(review): 'size' does not match the "PAPI" filter below, so it is
    # not part of hardware_features -- confirm whether it is still needed.
    papi_data['size'] = jmh_with_best['Param: applicationSize']
    hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
    hardware_features = papi_data.get(hardware_selected_columns)

    # NOTE(review): the diff this was reviewed from hides two lines just
    # before this point; the visible remainder of the original comment
    # reads "apps where they were the best choice and apps where they were
    # used." -- confirm against the full file that nothing else is missing.
    software_plus_hardware = software_with_jmh.join(hardware_features)
    sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)

    best_names = software_plus_hardware["Param: datastructureName_best"]
    return {
        "data": sw_hw_cleaned,
        "data_structure_names": best_names.sort_values().unique(),
        "software_selected_columns": software_selected_columns,
        "hardware_selected_columns": hardware_selected_columns,
    }
#%% print("Loading training data...") training_data = load_training_data("data/jmh-results-9307f70f.csv", "software-perf.csv", "data/hardware-perf-data.csv") print("Training data loaded") #%% sw_hw_cleaned = training_data["data"] hardware_selected_columns = training_data["hardware_selected_columns"] software_selected_columns = training_data["software_selected_columns"] print("Normalizing data") hw_normalized_data = normalize_data(sw_hw_cleaned, hardware_selected_columns, Loading