Commit 33190962 authored by Noric Couderc's avatar Noric Couderc
Browse files

Fix: Train-test split takes completely new benchmarks for test set

The original sklearn train_test_split splits individual rows, so samples
from the same benchmark can end up in both sets. This function instead
splits by benchmark: every benchmark used in the test set is completely
absent from the training set.

We still get roughly 70% accuracy with some small test data, that's encouraging!
parent 9a14d747
......@@ -17,7 +17,6 @@ from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
# # Utility functions
......@@ -348,6 +347,34 @@ def prepare_features(dataframe, hw_selected_cols, sw_selected_cols):
return pd.concat([hw_normalized_data, sw_normalized_data], axis=1)
def train_test_split(training_data, label_column, feature_columns,
                     test_size=0.25, random_state=None):
    """
    Splits the training data into a training set and test set,
    making sure the test set contains only benchmarks that are /not/
    in the training data.

    The split is done over the distinct values of the "benchmark_id"
    column, so every row of a given benchmark lands on the same side.

    Parameters:
        training_data: DataFrame with a "benchmark_id" column.
        label_column: name of the column holding the labels.
        feature_columns: list of column names used as features.
        test_size: fraction (between 0 and 1) of the distinct benchmarks
            to put in the test set.
        random_state: None (use numpy's global random state), an integer
            seed, or an object with a `permutation` method such as a
            numpy.random.RandomState.

    Returns (X_train, X_test, y_train, y_test)

    Raises ValueError if test_size is not between 0 and 1, and KeyError
    if a requested column is missing from training_data.
    """
    assert "benchmark_id" in training_data.columns, \
        "training_data needs a 'benchmark_id' column"
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    benchmark_ids = training_data["benchmark_id"]
    benchmarks = benchmark_ids.drop_duplicates().values
    n_benchmarks = len(benchmarks)

    # Pick an exact number of test benchmarks instead of flipping a coin
    # per benchmark: with few benchmarks, independent draws can easily
    # leave the test set (or the training set) empty.
    n_test = int(round(test_size * n_benchmarks))
    if 0 < test_size < 1 and n_benchmarks >= 2:
        # Keep at least one benchmark on each side of the split.
        n_test = min(max(n_test, 1), n_benchmarks - 1)

    if random_state is None:
        # Fall back on numpy's global state so that callers who seed it
        # with numpy.random.seed keep getting reproducible splits.
        rng = numpy.random
    elif hasattr(random_state, "permutation"):
        rng = random_state
    else:
        rng = numpy.random.RandomState(random_state)

    shuffled_positions = rng.permutation(n_benchmarks)
    test_benchmarks = benchmarks[shuffled_positions[:n_test]]

    # A membership mask keeps the rows in their original order; the index
    # is reset so both sets are numbered from 0, like the result of the
    # merge this replaces.
    in_test = benchmark_ids.isin(test_benchmarks)
    train_data = training_data[~in_test].reset_index(drop=True)
    test_data = training_data[in_test].reset_index(drop=True)

    # Plain indexing raises KeyError on a missing column, where
    # DataFrame.get would silently hand back None.
    X_train = train_data[feature_columns]
    X_test = test_data[feature_columns]

    y_train = train_data[label_column]
    y_test = test_data[label_column]

    return (X_train, X_test, y_train, y_test)
def train_classifiers(training_data, create_classifier, label_column, feature_columns, grouping_column):
Splits the data by collection to replace and trains
......@@ -378,10 +405,9 @@ def train_classifiers(training_data, create_classifier, label_column, feature_co
# software_selected_columns)
features = group.get(feature_columns)
labels = group[label_column]
X_train, X_test, y_train, y_test = train_test_split(features,
# group["Sample weight"],
X_train, X_test, y_train, y_test = train_test_split(group,
# We train the classifier
classifier = create_classifier()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment