Commit b8db1a9d authored by Noric Couderc


parent 8728290d
......@@ -27,7 +27,7 @@ def split_data(data, selected_columns, weight_column, label_column):
data: a DataFrame to split
selected_columns: A list of names of selected columns
weight_column: The name of the column to use as sample weight
label_column: The name of the column to use as target labels
label_column: The name of the column to use as target labels
(for classification)
Returns: a tuple (X_train, X_test, y_train, y_test, w_train, w_test)
......@@ -105,15 +105,20 @@ def load_jmh_data(filename):
worst_case_times = jmh_data["Score"] - jmh_data["Score Error (99.9%)"]
jmh_data["Lowest score"] = worst_case_times
# We keep only the JMH rows whose lowest score is strictly positive
# (rows where it is zero or negative are dropped).
# This is potentially why some values were dropped in the resulting table,
# even if their data structure type was not a WeakHashMap or an
# IdentityHashMap
jmh_data_filtered = jmh_data[jmh_data["Lowest score"] > 0]
# Grouping the applications, to compare similar ones.
selected_jmh_columns = ["Param: seed",
"Param: applicationSize",
"Param: baseStructureSize",
"Param: baseStructureSize"
# Best data structures
jmh_best_structures = jmh_data_filtered.sort_values(by="Lowest score",
jmh_best_structures = jmh_data_filtered\
.sort_values(by="Lowest score",
jmh_best_structures = jmh_best_structures.reset_index(drop=True)
# # Best data structures, computing the improvement
......@@ -129,7 +134,7 @@ def load_jmh_data(filename):
def compute_sample_weights(jmh_data):
......@@ -149,7 +154,7 @@ def load_software_counters(filename):
Takes a filename for a csv file containing the software performance counters
So with the following header:
And returns a table with columns matching the column name and the
And returns a table with columns matching the column name and the
values being the number of times the method has been called.
# Software performance counters
......@@ -167,7 +172,7 @@ def load_software_counters(filename):
return software_pivot_data
def merge_jmh_software(jmh_data, software_data):
def merge_jmh_software(jmh_data, software_data):
Function that takes a table of JMH data, and a table with the software
counters and merges them.
......@@ -184,7 +189,7 @@ def merge_jmh_software(jmh_data, software_data):
return software_with_jmh
# Hardware performance counters
# We load PAPI counters.
......@@ -204,7 +209,7 @@ def load_hardware_counters(filename):
papi_data = pd.read_csv(filename)
return papi_data
def remove_weakmap_hashmap(dataframe):
......@@ -262,7 +267,7 @@ def load_training_data(jmh_results_filename,
""" A function that loads the training data for training JBrainy
input: - The name of the file containing the JMH benchmark data
- The name of the software counters file
- The name of the software counters file
- The name of the hardware counters file
returns: A report containing the data
and some metadata, like the selected SW columns and HW columns
......@@ -305,7 +310,7 @@ if __name__ == "__main__":
# Merges them to get a well-built data file
# and trains the classifier.
print("Loading training data...")
training_data = load_training_data("data/jmh-results-9307f70f.csv",
......@@ -344,7 +349,8 @@ if __name__ == "__main__":
poly_transformer = PolynomialFeatures(degree=2)
features_extended_poly = poly_transformer.fit_transform(features_extended)
features_extended_poly = poly_transformer.fit_transform(features_extended,
features_extended_norm = StandardScaler().fit_transform(features_extended_poly)
labels = sw_hw_cleaned["Param: datastructureName_best"]
......@@ -361,7 +367,7 @@ if __name__ == "__main__":
# Fitting without the sample weights gives better
# Fitting without the sample weights gives better
# accuracy than with them; not sure why.
classifier = RandomForestClassifier()
