...
 
Commits (7)
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -7,6 +7,8 @@ import pandas as pd
import numpy
import pickle
import os.path # Checking existence of paths
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
......@@ -25,7 +27,7 @@ def split_data(data, selected_columns, weight_column, label_column):
data: a DataFrame to split
selected_columns: A list of names of selected columns
weight_column: The name of the column to use as sample weight
label_column: The name of the column to use as target labels
label_column: The name of the column to use as target labels
(for classification)
Returns: a tuple (X_train, X_test, y_train, y_test, w_train, w_test)
"""
......@@ -103,15 +105,20 @@ def load_jmh_data(filename):
worst_case_times = jmh_data["Score"] - jmh_data["Score Error (99.9%)"]
jmh_data["Lowest score"] = worst_case_times
# We filter out the jmh rows where the lowest score is zero or negative
# This is potentially why some values were dropped in the resulting table,
# even if their data structure type was not a WeakHashMap or an
# IdentityHashMap
jmh_data_filtered = jmh_data[jmh_data["Lowest score"] > 0]
# Grouping the applications, to compare similar ones.
selected_jmh_columns = ["Param: seed",
"Param: applicationSize",
"Param: baseStructureSize",
"Param: baseStructureSize"
"Benchmark"]
# Best data structures
jmh_best_structures = jmh_data_filtered.sort_values(by="Lowest score",
ascending=False).drop_duplicates(selected_jmh_columns)
jmh_best_structures = jmh_data_filtered\
.sort_values(by="Lowest score",
ascending=False)\
.drop_duplicates(selected_jmh_columns)
jmh_best_structures = jmh_best_structures.reset_index(drop=True)
# # Best data structures, computing the improvement
......@@ -127,7 +134,7 @@ def load_jmh_data(filename):
#%%
#%%
def compute_sample_weights(jmh_data):
"""
......@@ -136,7 +143,7 @@ def compute_sample_weights(jmh_data):
The weight is computed by taking the ratio of improvement,
and adding its log to 1/N, where N is the number of samples
"""
ratios = jmh_data["Lowest score_best"] / jmh_with_best["Lowest score"]
ratios = jmh_data["Lowest score_best"] / jmh_data["Lowest score"]
sample_weights = (1 / ratios.shape[0]) + numpy.log(ratios)
return sample_weights
......@@ -147,7 +154,7 @@ def load_software_counters(filename):
Takes a filename for a csv file containing the software performance counters
So with the following header:
seed,size,base_structure_size,data_structure,method,count
And returns a table with columns matching the column name and the
And returns a table with columns matching the column name and the
values being the number of times the method has been called.
"""
# Software performance counters
......@@ -165,7 +172,7 @@ def load_software_counters(filename):
return software_pivot_data
def merge_jmh_software(jmh_data, software_data):
def merge_jmh_software(jmh_data, software_data):
"""
Function that takes a table of JMH data, and a table with the software
counters and merges them.
......@@ -182,7 +189,7 @@ def merge_jmh_software(jmh_data, software_data):
right_on=right_columns)
return software_with_jmh
#%%
#%%
# Hardware performance counters
# We load PAPI counters.
......@@ -202,7 +209,7 @@ def load_hardware_counters(filename):
papi_data = pd.read_csv(filename)
return papi_data
#%%
#%%
def remove_weakmap_hashmap(dataframe):
"""
......@@ -255,23 +262,35 @@ def normalize_data(data, selected_columns, normalizing_column):
#%%
if __name__ == "__main__":
# The script takes a number of files as input
# Loads the JMH data, SW & HW performance counters
# Merges them to get a well built data file
# And trains the classifier.
#%%
print("Loading training data...")
jmh_with_best = load_jmh_data("data/jmh-results-34f6dc24.csv")
def load_training_data(jmh_results_filename,
software_counters_filename,
hardware_counters_filename):
""" A function that loads the training data for training JBrainy
input: - The name of the file containing the JMH benchmark data
- The name of the software counters file
- The name of the hardware counters file
returns: A report containing the data
and some metadata, like the selected SW columns and HW columns
and the names of the data structures we will be dealing with
"""
# Precondition: All the provided files must exist.
for file in [jmh_results_filename,
software_counters_filename,
hardware_counters_filename]:
if os.path.isfile(file):
# OK.
continue
else:
raise "The path '{0}' does not exist".format(file)
# Ok here we go
jmh_with_best = load_jmh_data(jmh_results_filename)
jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)
software_data = load_software_counters("data/software-perf.csv")
software_data = load_software_counters(software_counters_filename)
software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
software_selected_columns = [c for c in software_with_jmh.columns if "run" in c]
software_selected_columns = software_data.columns
software_features = software_with_jmh.get(software_selected_columns)
data_structure_names=software_with_jmh["Param: datastructureName_best"].sort_values().unique()
papi_data = load_hardware_counters("data/hardware-perf-50.csv")
papi_data = load_hardware_counters(hardware_counters_filename)
papi_data['size'] = jmh_with_best['Param: applicationSize']
hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
hardware_features = papi_data.get(hardware_selected_columns)
......@@ -280,10 +299,31 @@ if __name__ == "__main__":
# apps where they were the best choice and apps where they were used.
software_plus_hardware = software_with_jmh.join(hardware_features)
sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)
return { "data" : sw_hw_cleaned,
"data_structure_names" : software_plus_hardware["Param: datastructureName_best"].sort_values().unique(),
"software_selected_columns": software_selected_columns,
"hardware_selected_columns": hardware_selected_columns }
if __name__ == "__main__":
# The script takes a number of files as input
# Loads the JMH data, SW & HW performance counters
# Merges them to get a well built data file
# And trains the classifier.
#%%
print("Loading training data...")
training_data = load_training_data("data/jmh-results-9307f70f.csv",
"software-perf.csv",
"data/hardware-perf-data.csv")
print("Training data loaded")
#%%
sw_hw_cleaned = training_data["data"]
hardware_selected_columns = training_data["hardware_selected_columns"]
software_selected_columns = training_data["software_selected_columns"]
print("Normalizing data")
hw_normalized_data = normalize_data(sw_hw_cleaned,
hardware_selected_columns,
......@@ -309,7 +349,8 @@ if __name__ == "__main__":
axis=1)
poly_transformer = PolynomialFeatures(degree=2)
features_extended_poly = poly_transformer.fit_transform(features_extended)
features_extended_poly = poly_transformer.fit_transform(features_extended,
features_extended.columns.values)
features_extended_norm = StandardScaler().fit_transform(features_extended_poly)
labels = sw_hw_cleaned["Param: datastructureName_best"]
......@@ -323,15 +364,20 @@ if __name__ == "__main__":
sw_hw_cleaned["Sample weight"],
stratify=labels,
test_size=0.4)
#%%
# Fitting without the sample weights gives better
# Fitting without the sample weights gives better
# accuracy than with them, not sure why.
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train, w_train)
# We just add it to the class
classifier.feature_names = poly_transformer \
.get_feature_names(features_extended.columns.values)
print("Accuracy: {0}".format(classifier.score(X_test, y_test, w_test)))
print("Classifier trained")
......