Commit b8db1a9d authored by Noric Couderc

Formatting

parent 8728290d
@@ -27,7 +27,7 @@ def split_data(data, selected_columns, weight_column, label_column):
data: a DataFrame to split
selected_columns: A list of names of selected columns
weight_column: The name of the column to use as sample weight
label_column: The name of the column to use as target labels
(for classification)
Returns: a tuple (X_train, X_test, y_train, y_test, w_train, w_test)
"""
@@ -105,15 +105,20 @@ def load_jmh_data(filename):
worst_case_times = jmh_data["Score"] - jmh_data["Score Error (99.9%)"]
jmh_data["Lowest score"] = worst_case_times
# We filter out the jmh rows where the lowest score is equal to zero
+# This is potentially why some values were dropped in the resulting table,
+# even if their data structure type was not a WeakHashMap or an
+# IdentityHashMap
jmh_data_filtered = jmh_data[jmh_data["Lowest score"] > 0]
# Grouping the applications, to compare similar ones.
selected_jmh_columns = ["Param: seed",
"Param: applicationSize",
"Param: baseStructureSize",
"Benchmark"]
# Best data structures
-jmh_best_structures = jmh_data_filtered.sort_values(by="Lowest score",
-                                                    ascending=False).drop_duplicates(selected_jmh_columns)
+jmh_best_structures = jmh_data_filtered\
+    .sort_values(by="Lowest score",
+                 ascending=False)\
+    .drop_duplicates(selected_jmh_columns)
jmh_best_structures = jmh_best_structures.reset_index(drop=True)
# # Best data structures, computing the improvement
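The sort-then-drop_duplicates chain picks, for each (seed, applicationSize, baseStructureSize, Benchmark) group, the row with the highest "Lowest score": sorting descending puts the best row first, and drop_duplicates keeps only the first row per key. A small self-contained illustration of the idiom (the toy data is an assumption):

import pandas as pd

toy = pd.DataFrame({"Benchmark": ["b1", "b1", "b2"],
                    "Param: datastructureName": ["ArrayList", "LinkedList", "ArrayList"],
                    "Lowest score": [3.0, 5.0, 2.0]})
# Descending sort puts the best-scoring row per Benchmark first;
# drop_duplicates then keeps exactly that row for each key.
best = (toy.sort_values(by="Lowest score", ascending=False)
           .drop_duplicates(["Benchmark"])
           .reset_index(drop=True))
print(best)  # b1 -> LinkedList (5.0), b2 -> ArrayList (2.0)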
@@ -129,7 +134,7 @@ def load_jmh_data(filename):
#%%
def compute_sample_weights(jmh_data):
"""
@@ -149,7 +154,7 @@ def load_software_counters(filename):
Takes a filename for a CSV file containing the software performance counters,
i.e., with the following header:
seed,size,base_structure_size,data_structure,method,count
and returns a table with one column per method name, the values being
the number of times the method has been called.
"""
# Software performance counters
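Most of the loading code is collapsed here. A hedged sketch of the pivot the docstring above describes, assuming pandas' pivot_table with one output column per method; the exact index columns and aggregation are assumptions:

import pandas as pd

def load_software_counters_sketch(filename):
    raw = pd.read_csv(filename)
    # One column per method, values are call counts, keyed by run parameters.
    return raw.pivot_table(index=["seed", "size", "base_structure_size",
                                  "data_structure"],
                           columns="method",
                           values="count",
                           fill_value=0).reset_index()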
@@ -167,7 +172,7 @@ def load_software_counters(filename):
return software_pivot_data
def merge_jmh_software(jmh_data, software_data):
"""
Function that takes a table of JMH data and a table with the software
counters, and merges them.
@@ -184,7 +189,7 @@ def merge_jmh_software(jmh_data, software_data):
right_on=right_columns)
return software_with_jmh
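Only the tail of the merge call survives in this hunk. A sketch of the kind of join involved, assuming pandas.merge on the shared run parameters; the column lists and the left/right roles are assumptions:

import pandas as pd

def merge_jmh_software_sketch(jmh_data, software_data):
    left_columns = ["seed", "size", "base_structure_size", "data_structure"]
    right_columns = ["Param: seed", "Param: applicationSize",
                     "Param: baseStructureSize", "Param: datastructureName"]
    # Inner join: keep only the runs present in both tables.
    return pd.merge(software_data, jmh_data,
                    left_on=left_columns, right_on=right_columns)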
#%%
# Hardware performance counters
# We load PAPI counters.
@@ -204,7 +209,7 @@ def load_hardware_counters(filename):
papi_data = pd.read_csv(filename)
return papi_data
#%%
def remove_weakmap_hashmap(dataframe):
"""
@@ -262,7 +267,7 @@ def load_training_data(jmh_results_filename,
hardware_counters_filename):
""" A function that loads the training data for training JBrainy
input: - The name of the file containing the JMH benchmark data
- The name of the software counters file
- The name of the hardware counters file
returns: A report containing the data
and some metadata, like the selected SW columns and HW columns
@@ -305,7 +310,7 @@ if __name__ == "__main__":
# Merges them to get a well built data file
# And train the classifier.
#%%
print("Loading training data...")
training_data = load_training_data("data/jmh-results-9307f70f.csv",
@@ -344,7 +349,8 @@ if __name__ == "__main__":
axis=1)
poly_transformer = PolynomialFeatures(degree=2)
-features_extended_poly = poly_transformer.fit_transform(features_extended)
+features_extended_poly = poly_transformer.fit_transform(features_extended,
+                                                        features_extended.columns.values)
features_extended_norm = StandardScaler().fit_transform(features_extended_poly)
labels = sw_hw_cleaned["Param: datastructureName_best"]
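The degree-2 expansion adds a bias column, the original features, and all pairwise products before standardization. Worth noting: scikit-learn's fit_transform takes an optional y that PolynomialFeatures ignores, so the column names passed as the second argument in this commit do not affect the output. A small self-contained illustration (the toy matrix is an assumption):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

X = np.array([[1.0, 2.0],
              [3.0, 4.0]])
# Degree-2 expansion of (a, b): [1, a, b, a^2, a*b, b^2] per row.
X_poly = PolynomialFeatures(degree=2).fit_transform(X)  # shape (2, 6)
# Standardize each expanded feature to zero mean and unit variance.
X_norm = StandardScaler().fit_transform(X_poly)
print(X_poly.shape, X_norm.mean(axis=0))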
@@ -361,7 +367,7 @@ if __name__ == "__main__":
#%%
# Fitting without the sample weights gives better
# accuracy than with them, not sure why.
classifier = RandomForestClassifier()
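A hedged sketch of the comparison this comment describes, fitting one forest with sample weights and one without; the synthetic dataset and weights are stand-ins for the benchmark data, not taken from the repository:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Toy stand-in for the benchmark features and labels.
X, y = make_classification(n_samples=200, n_features=6, random_state=0)
w = np.random.default_rng(0).uniform(0.5, 2.0, size=len(y))
X_tr, X_te, y_tr, y_te, w_tr, w_te = train_test_split(X, y, w, random_state=0)

weighted = RandomForestClassifier(random_state=0).fit(X_tr, y_tr, sample_weight=w_tr)
unweighted = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
# Compare holdout accuracy of the two fits, as the comment suggests.
print(weighted.score(X_te, y_te), unweighted.score(X_te, y_te))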