Commit a91a367a authored by Noric Couderc's avatar Noric Couderc

Added function for loading training data in model training script

This function loads the training data and returns it nicely prepared,
so that it is ready to be used for training.
parent 59a9ee84
......@@ -7,6 +7,8 @@ import pandas as pd
import numpy
import pickle
import os.path # Checking existence of paths
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
......@@ -255,23 +257,35 @@ def normalize_data(data, selected_columns, normalizing_column):
#%%
if __name__ == "__main__":
# The script takes a number of files as input
# Loads the JMH data, SW & HW performance counters
# Merges them to get a well built data file
# And train the classifier.
#%%
print("Loading training data...")
jmh_with_best = load_jmh_data("data/jmh-results-34f6dc24.csv")
def load_training_data(jmh_results_filename,
                       software_counters_filename,
                       hardware_counters_filename):
    """Load and merge the data used for training JBrainy.

    Args:
        jmh_results_filename: Path of the file containing the JMH
            benchmark data.
        software_counters_filename: Path of the software counters file.
        hardware_counters_filename: Path of the hardware counters file.

    Returns:
        A dict ("report") with four entries:
          - "data": the software data merged with the JMH data, joined
            with the selected hardware columns, after
            remove_weakmap_hashmap has been applied.
          - "data_structure_names": the sorted unique values of the
            "Param: datastructureName_best" column.
          - "software_selected_columns": the columns of the loaded
            software counters table.
          - "hardware_selected_columns": the hardware counter columns
            whose name contains "PAPI".

    Raises:
        FileNotFoundError: If any of the three paths is not an existing
            file.
    """
    # Precondition: all the provided files must exist. Checking up front
    # gives one clear error before any loading work starts.
    for path in (jmh_results_filename,
                 software_counters_filename,
                 hardware_counters_filename):
        if not os.path.isfile(path):
            # A bare string cannot be raised (Python answers with a
            # TypeError that hides this message), so raise a real exception.
            raise FileNotFoundError(
                "The path '{0}' does not exist".format(path))

    # Ok here we go
    jmh_with_best = load_jmh_data(jmh_results_filename)
    jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)

    software_data = load_software_counters(software_counters_filename)
    software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
    software_selected_columns = software_data.columns
    # NOTE(review): the next two locals are not used by the visible code;
    # they are kept because the diff view elides two lines further down
    # that may reference them -- confirm against the full file.
    software_features = software_with_jmh.get(software_selected_columns)
    data_structure_names = software_with_jmh["Param: datastructureName_best"].sort_values().unique()

    papi_data = load_hardware_counters(hardware_counters_filename)
    papi_data['size'] = jmh_with_best['Param: applicationSize']
    hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
    hardware_features = papi_data.get(hardware_selected_columns)

    # NOTE(review): two unchanged lines are elided by the diff view here
    # (presumably the beginning of the comment below) -- TODO confirm.
    # apps where they were the best choice and apps where they were used.
    software_plus_hardware = software_with_jmh.join(hardware_features)
    sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)

    return {
        "data": sw_hw_cleaned,
        "data_structure_names": software_plus_hardware["Param: datastructureName_best"].sort_values().unique(),
        "software_selected_columns": software_selected_columns,
        "hardware_selected_columns": hardware_selected_columns,
    }
if __name__ == "__main__":
# The script takes a number of files as input
# Loads the JMH data, SW & HW performance counters
# Merges them to get a well built data file
# And train the classifier.
#%%
print("Loading training data...")
training_data = load_training_data("data/jmh-results-9307f70f.csv",
"software-perf.csv",
"data/hardware-perf-data.csv")
print("Training data loaded")
#%%
sw_hw_cleaned = training_data["data"]
hardware_selected_columns = training_data["hardware_selected_columns"]
software_selected_columns = training_data["software_selected_columns"]
print("Normalizing data")
hw_normalized_data = normalize_data(sw_hw_cleaned,
hardware_selected_columns,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment