#!/usr/bin/env python
# coding: utf-8

# Importing libraries

import os.path  # Checking existence of paths
import pickle
import sys
import hashlib # Hashing files
import time # Converting time stamps
import datetime # Getting current date
import subprocess # Running git...

import numpy
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef
# sklearn's splitter is aliased because this module defines its own
# train_test_split() further down.
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures

# # Utility functions


# NOT USED
def split_data(data, selected_columns, weight_column, label_column):
    """
    Split the data into a training set and a test set.
    data: a DataFrame to split
    selected_columns: A list of names of selected columns
    weight_column: The name of the column to use as sample weight
    label_column: The name of the column to use as target labels
                  (for classification)
    Returns: a tuple (X_train, X_test, y_train, y_test, w_train, w_test)
    """
    selected = data.get(selected_columns)
    weights = data.get(weight_column)
    target_labels = data[label_column]
    return sk_train_test_split(selected,
                               target_labels,
                               weights,
                               stratify=target_labels,
                               test_size=0.4)


# NOT USED
def get_confusion_matrix(classifier, X, y_true):
    y_pred = classifier.predict(X)
    matrix = confusion_matrix(y_true, y_pred)
    # We normalize the confusion matrix
    m_norm = matrix.astype('float') / matrix.sum(axis=1)[:, numpy.newaxis]
    return m_norm


# NOT USED
def get_classifier_summary(y_pred, y_true):
    # sklearn's metric functions take (y_true, y_pred); both metrics used
    # here are symmetric, but we follow the documented argument order.
    return {"F1 Score": f1_score(y_true, y_pred, average="micro"),
            "Matthews correlation coefficient":
            matthews_corrcoef(y_true, y_pred)}


# NOT USED
def group_by_and_train(classifier,
                       dataset,
                       grouping_columns,
                       selected_columns,
                       label_column):
    """
    Groups the rows in dataset by grouping_columns and trains a classifier
    for each group, printing the training and test accuracy of each group's
    classifier.

    Arguments:
    classifier -- the classifier to perform classification
    dataset -- the dataset
    grouping_columns -- the columns to use to group rows
    selected_columns -- the columns to consider for the training
    label_column -- the column containing the target labels

    Returns:
    Nothing
    """
    for group_label, group in dataset.groupby(grouping_columns):
        features = group.get(selected_columns)
        labels = group[label_column]
        X_train, X_test, y_train, y_test = sk_train_test_split(features,
                                                                labels,
                                                                stratify=labels)
        new_classifier = clone(classifier)
        new_classifier.fit(X_train, y_train)
        print(new_classifier.score(X_train, y_train))
        print(new_classifier.score(X_test, y_test))

#%%  # Processing jmh data


def load_jmh_data(filename):
    """
    A function that reads a JMH data file, and gets the best
    data structure for each of the apps
    filename : A string with the path to the file
    """
    jmh_data = pd.read_csv(filename)
    return prepare_jmh_data(jmh_data)


def prepare_jmh_data(jmh_data):
    """
    Computes some derived features on JMH benchmark data:
    - worst case times taking into account score error (should be "scores")
    - filters the benchmarks which have negative worst case scores (error > score)
    - Sets the labels for the best data structure for the benchmark
    """
    worst_case_scores = jmh_data["Score"] - jmh_data["Score Error (99.9%)"]
    # We filter out the jmh rows where the lowest score is equal to zero
    # This is potentially why some values were dropped in the resulting table,
    # even if their data structure type was not a WeakHashMap or an
    # IdentityHashMap
    jmh_data = jmh_data[worst_case_scores > 0]

    # Grouping the applications, to compare similar ones.
    selected_jmh_columns = ["Param: seed",
                            "Param: applicationSize",
                            "Param: baseStructureSize",
                            "Param: methodSelectionStrategyId",
                            "Benchmark"]
    # Best data structures
    jmh_best_structures = jmh_data\
            .sort_values(by="Score", ascending=False)\
            .drop_duplicates(selected_jmh_columns)
    jmh_best_structures = jmh_best_structures.reset_index(drop=True)

    # # Best data structures, computing the improvement
    # We may want to weigh the samples according to the improvement that was
    # achieved: samples that do not improve much can be misclassified without
    # it mattering much.
    jmh_with_best = pd.merge(jmh_data,
                             jmh_best_structures,
                             on=selected_jmh_columns,
                             suffixes=("", "_best"))

    # We remove some unnecessary columns
    columns_to_drop = ["Samples_best", "Score Error (99.9%)_best",
                       "Threads_best", "Unit_best", "Mode_best"]

    rename_dict = { "Param: datastructureName_best" : "Best datastructure"}
    return jmh_with_best.drop(columns_to_drop, axis=1)\
                        .rename(rename_dict, axis=1)
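

# Illustrative sketch (not part of the original pipeline): the "best data
# structure" selection above relies on sorting by Score (descending) and then
# dropping duplicates on the grouping keys, which keeps the highest-scoring
# row of each group. All column names and values below are made up.
def _example_best_per_group():
    demo = pd.DataFrame({"app":   ["a", "a", "b", "b"],
                         "ds":    ["ArrayList", "LinkedList", "ArrayList", "LinkedList"],
                         "Score": [10.0, 12.0, 7.0, 5.0]})
    # Keeps ("a", "LinkedList", 12.0) and ("b", "ArrayList", 7.0).
    return demo.sort_values(by="Score", ascending=False).drop_duplicates("app")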

#%%

def compute_sample_ratios(jmh_data):
    """
    Takes a DataFrame with columns "Score_best" and "Score"
    and returns the ratio of improvement.
    The ratio is >= 1: the best score is at least as high as the current one.
    """
    assert "Score_best" in list(jmh_data.columns)
    assert "Score" in list(jmh_data.columns)

    return jmh_data["Score_best"] / jmh_data["Score"]


def compute_sample_weights(jmh_data):
    """
    Takes a DataFrame with columns "Score_best" and "Score"
    and returns the weights of the samples.
    The weight is 1 if the best data structure improves the score by more
    than 5%, or if the current structure is already the best (ratio == 1.0);
    otherwise the weight is 0.
    """
    ratios = compute_sample_ratios(jmh_data)
    # return numpy.sqrt(ratios)
    # We use the same information as for the original Brainy paper
    # If the improvement is less than 5%, we assign a weight of zero.
    kept_samples = (ratios > 1.05) | (ratios == 1.0)
    return (kept_samples).astype(float)
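

# Illustrative sketch (not part of the original pipeline): a tiny synthetic
# frame showing how the weighting rule above behaves. All values are made up.
def _example_sample_weights():
    demo = pd.DataFrame({"Score":      [100.0, 100.0, 100.0],
                         "Score_best": [100.0, 103.0, 110.0]})
    # Ratios are [1.0, 1.03, 1.10]; only the 3% improvement is dropped,
    # so the returned weights are [1.0, 0.0, 1.0].
    return compute_sample_weights(demo)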


#%%

# NOT USED
def load_software_counters(filename):
    """
    Takes a filename for a csv file containing the software performance
    counters, with the following header:
        seed,size,base_structure_size,data_structure,method,count
    and returns a table with one column per method name, the
    values being the number of times the method has been called.
    """
    # Software performance counters
    software_data = pd.read_csv(filename)

    # Get a pivot table
    # (one row per application configuration, one column per method,
    #  holding the call counts)
    software_pivot_data = software_data.pivot_table(values="count",
                                                    index=["seed",
                                                           "size",
                                                           "base_structure_size",
                                                           "data_structure"],
                                                    columns=["method"],
                                                    fill_value=0)
    return software_pivot_data

# NOT USED
def merge_jmh_software(jmh_data, software_data):
    """
    Function that takes a table of JMH data, and a table with the software
    counters and merges them.
    """
    # Merging with JMH data (particularly, the best data structure)
    left_columns = ["Param: seed",
                    "Param: applicationSize",
                    "Param: baseStructureSize",
                    "Param: datastructureName"]
    right_columns = ["seed", "size", "base_structure_size", "data_structure"]
    software_with_jmh = pd.merge(left=jmh_data,
                                 right=software_data.reset_index(),
                                 left_on=left_columns,
                                 right_on=right_columns)
    return software_with_jmh

#%%

# Hardware performance counters
# We load PAPI counters.

# Loading papi_data
# PAPI Perf with 50 runs


def load_hardware_counters(filename):
    """
    Loads a file with hardware counter data
    filename: The name of a csv file with headers:
    application,data_structure,best_data_structure,PAPI_BR_CN, PAPI_BR_INS,...
    returns: A dataframe with the hardware performance data, where indexes
    should match the JMH data
    """
    papi_data = pd.read_csv(filename)
    return papi_data

#%%

def remove_weakmap_hashmap(dataframe):
    """
    Removes lines related to WeakHashMap and IdentityHashMap
    dataframe: A dataframe with columns "Param: datastructureName" and
               "Param: datastructureName_best"
    returns: a dataframe with appropriate rows deleted.
    """
    mask = dataframe["Param: datastructureName"].isin(["WeakHashMap",
                                                       "IdentityHashMap"])
    mask_best = dataframe["Param: datastructureName_best"].isin(["WeakHashMap",
                                                                 "IdentityHashMap"])
    dataframe_cleaned = dataframe[~mask & ~mask_best]
    return dataframe_cleaned


#%%


# NOT USED
def generate_polynomial_features(features):
    """
    features: A matrix of feature vectors
    returns: a matrix with polynomial features, including ratios
    """
    poly_transformer = PolynomialFeatures(degree=2)
    # Problem: this selection scheme may produce a varying number of features.
    contains_zero = (features == 0).any()
    columns_without_zero = contains_zero[~contains_zero].index
    ratios = 1 / features.get(columns_without_zero)
    features_with_inverses = features.join(ratios, rsuffix="_inverse")
    features_poly = poly_transformer.fit_transform(features_with_inverses)
    return features_poly


#%%


def normalize_data(data, selected_columns, normalizing_column):
    """
    Normalizes the features based on a column
    data: The dataFrame to process
    selected_columns: List of column names of columns to normalize
    normalizing_column: Name of the column to use for normalization
    """
    assert(data is not None)
    selected = data.get(selected_columns).astype(float)
    normalizing_col = data[normalizing_column].astype(float)
    assert(selected.shape[0] == normalizing_col.shape[0])
    normalized = selected.div(normalizing_col, axis="index")
    return normalized
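

# Illustrative sketch (not part of the original pipeline): normalize two
# hypothetical counter columns by a third one, row by row.
def _example_normalize_data():
    demo = pd.DataFrame({"PAPI_BR_CN":   [10, 20],
                         "PAPI_BR_INS":  [40, 80],
                         "PAPI_TOT_INS": [100, 200]})
    # Each selected column is divided by PAPI_TOT_INS of the same row,
    # giving [[0.1, 0.4], [0.1, 0.4]].
    return normalize_data(demo, ["PAPI_BR_CN", "PAPI_BR_INS"], "PAPI_TOT_INS")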

#%%


def load_training_data(jmh_results_filename,
                       software_counters_filename,
                       hardware_counters_filename):
    """ A function that loads the training data for training JBrainy
        input: - The name of the file containing the JMH benchmark data
               - The name of the software counters file
               - The name of the hardware counters filename
        returns: A report containing the data
            and some metadata, like the selected SW columns and HW columns
            and the names of the data structures we will be dealing with
    """
    # Precondition: All the provided files must exist.
    for file in [jmh_results_filename,
                 software_counters_filename,
                 hardware_counters_filename]:
        if not os.path.isfile(file):
            raise Exception("The path '{0}' does not exist".format(file))

    # Ok here we go
    jmh_with_best = load_jmh_data(jmh_results_filename)
    jmh_with_best["Ratio improvement"] = compute_sample_ratios(jmh_with_best)
    jmh_with_best["Sample weight"] = compute_sample_weights(jmh_with_best)
    software_data = load_software_counters(software_counters_filename)
    software_with_jmh = merge_jmh_software(jmh_with_best, software_data)
    software_selected_columns = list(software_data.columns)
    software_features = software_with_jmh.get(software_selected_columns)
    papi_data = load_hardware_counters(hardware_counters_filename)
    papi_data['size'] = jmh_with_best['Param: applicationSize']
    hardware_selected_columns = [c for c in papi_data.columns if "PAPI" in c]
    hardware_features = papi_data.get(hardware_selected_columns)
    # Removing `WeakHashMap` and `IdentityHashMap`
    # Since `WeakHashMap` and `IdentityHashMap` have different semantics, we remove
    # apps where they were the best choice and apps where they were used.
    software_plus_hardware = software_with_jmh.join(hardware_features)
    sw_hw_cleaned = remove_weakmap_hashmap(software_plus_hardware)
    return { "data" : sw_hw_cleaned,
             "data_structure_names" : software_plus_hardware["Param: datastructureName_best"].sort_values().unique(),
             "software_selected_columns": software_selected_columns,
             "hardware_selected_columns": hardware_selected_columns }


def prepare_features(dataframe, hw_selected_cols, sw_selected_cols):
    hw_normalized_data = normalize_data(dataframe,
                                        hw_selected_cols,
                                        "PAPI_TOT_INS")
    sw_normalized_data = normalize_data(dataframe,
                                        sw_selected_cols,
                                        "Param: applicationSize")

    return pd.concat([hw_normalized_data, sw_normalized_data], axis=1)


def train_test_split(training_data, label_column, feature_columns, test_size=0.25):
    """
    Splits the training data into a training set and test set,
    making sure the test set contains only benchmarks that are /not/
    in the training data

    Returns (X_train, X_test, y_train, y_test)
    """
    assert "benchmark_id" in training_data.columns

    benchmarks = training_data["benchmark_id"].drop_duplicates()
    benchmark_filter = numpy.random.choice(a=[True, False], size=len(benchmarks),
                                           p=[test_size, 1 - test_size])

    test_benchmarks = benchmarks[benchmark_filter]
    training_benchmarks = benchmarks[~benchmark_filter]

    train_data = training_data.merge(training_benchmarks)
    test_data = training_data.merge(test_benchmarks)
    X_train = train_data.get(feature_columns)
    X_test = test_data.get(feature_columns)

    y_train = train_data.get(label_column)
    y_test = test_data.get(label_column)

    return (X_train, X_test, y_train, y_test)
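

# Illustrative sketch (not part of the original pipeline): the benchmark-level
# split above keeps every row of a given benchmark on the same side of the
# split. The tiny frame and its column names here are made up; which
# benchmarks land in the test set is random.
def _example_benchmark_level_split():
    demo = pd.DataFrame({"benchmark_id": ["a", "a", "b", "b", "c", "c"],
                         "feat":         [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                         "label":        ["x", "x", "y", "y", "x", "y"]})
    return train_test_split(demo, "label", ["feat"])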


def train_classifiers(training_data, create_classifier, label_column, feature_columns, grouping_column):
    """
    Splits the data by collection to replace and trains
    a classifier for each one of them.

    Returns a map from collection name to its associated classifier.

    training_data : The dataframe to use
    create_classifier : Function which creates a new classifier
    label_column : String for the name of the column to use as label
    feature_columns : List of strings with the names of columns to use as features
    grouping_column : String for the name of the column to use for grouping (one classifier per value)
    """
    assert label_column in training_data.columns
    assert grouping_column in training_data.columns

    for f_c in feature_columns:
        assert f_c in training_data.columns


    training_data_grouped = training_data.groupby(grouping_column)

    classifiers = {}

    for label, group in training_data_grouped:
        # Commented out because JBrainy already normalizes the features
        # features = prepare_features(group, hardware_selected_columns,
        #                             software_selected_columns)
        features = group.get(feature_columns)
        labels = group[label_column]
        X_train, X_test, y_train, y_test = train_test_split(group,
                                                            label_column,
                                                            feature_columns)
        # We train the classifier
        classifier = create_classifier()

        classifier.fit(X_train, y_train)

        # We attach some metadata directly to the classifier object
        classifier.feature_names = features.columns.values
        classifier.training_accuracy = classifier.score(X_train, y_train)
        classifier.test_accuracy = classifier.score(X_test, y_test)

        print("Accuracy for {0}: {1}".format(
            label,
            classifier.test_accuracy))

        classifiers[label] = classifier

    return classifiers


def benchmark_id(row):
    """
    Computes the benchmark ID for a row of JMH data
    """
    # Python is golden sometimes
    interfaces = { i + "ApplicationBenchmark" : i for i in ["List", "Map", "Set"]}

    return ":".join(["Synth",
                     str(row["Param: methodSelectionStrategyId"]),
                     str(row["Param: seed"]),
                     str(row["Param: applicationSize"]),
                     interfaces[row["Benchmark"].split(".")[4]],
                     str(row["Param: baseStructureSize"]),
                     str(row["Param: datastructureName"])])
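

# Illustrative sketch (not part of the original pipeline): build a benchmark
# ID from a hand-made row. The dotted "Benchmark" name is fake; only its
# fifth component (the interface benchmark class) is used by benchmark_id.
def _example_benchmark_id():
    row = pd.Series({"Param: methodSelectionStrategyId": 1,
                     "Param: seed": 42,
                     "Param: applicationSize": 100,
                     "Benchmark": "a.b.c.d.ListApplicationBenchmark.run",
                     "Param: baseStructureSize": 10,
                     "Param: datastructureName": "ArrayList"})
    # -> "Synth:1:42:100:List:10:ArrayList"
    return benchmark_id(row)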


def is_feature_normalized(string):
    """
    Returns true if the string represents a number between 0 and 1
    Also returns true if the string is NOT a number (it's a text feature)
    """
    try:
        x = float(string)
        # In some cases, some features have a value slightly
        # higher than 1...
        return 0 <= x <= 1.5
    except ValueError:
        return True


def are_features_normalized(features_data):
    """
    Returns true if the features in the dataset are normalized
    """
    # The features are actually strings... even the numeric values
    return features_data["value"].apply(is_feature_normalized).all()


def prepare_long_form_data(jmh_data, features_data):
    """
    Takes features and jmh data to create data JBrainy can
    use for training
    The "features" data should be a long-form dataframe with columns
    (benchmark_id, iteration, feature, feature_type, value)
    """
    validate_jmh_data(jmh_data)
    validate_features_data(features_data, len(jmh_data))

    jmh_with_best = prepare_jmh_data(jmh_data)

    jmh_with_best["benchmark_id"] = jmh_with_best.apply(benchmark_id, axis=1)

    features_data_wide = pd.pivot(features_data,
                                  columns="feature",
                                  values="value",
                                  index=["benchmark_id", "iteration"]).reset_index().fillna(value=0)

    hardware_features = features_data[features_data["feature_type"] == "hardware"]
    hardware_selected_columns = list(hardware_features["feature"].drop_duplicates())

    software_features = features_data[features_data["feature_type"] == "software"]
    software_selected_columns = list(software_features["feature"].drop_duplicates())

    for m in (software_selected_columns + hardware_selected_columns):
        features_data_wide[m] = pd.to_numeric(features_data_wide[m])

    merged_table = pd.merge(jmh_with_best, features_data_wide, on="benchmark_id")

    # I think there should be one row for each iteration of each benchmark
    assert len(merged_table) == len(features_data.get(["benchmark_id", "iteration"]).drop_duplicates())

    # Return data that should replace the original data.
    return { "data" : merged_table,
             "software_selected_columns" : software_selected_columns,
             "hardware_selected_columns" : hardware_selected_columns }

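
# Illustrative sketch (not part of the original pipeline): the long-to-wide
# pivot used in prepare_long_form_data, on a few made-up feature rows.
def _example_features_pivot():
    long_form = pd.DataFrame({"benchmark_id": ["b1", "b1", "b1", "b1"],
                              "iteration":    [0, 0, 1, 1],
                              "feature":      ["PAPI_TOT_INS", "add",
                                               "PAPI_TOT_INS", "add"],
                              "feature_type": ["hardware", "software",
                                               "hardware", "software"],
                              "value":        ["100", "5", "120", "6"]})
    # One row per (benchmark_id, iteration), one column per feature name.
    return pd.pivot(long_form,
                    columns="feature",
                    values="value",
                    index=["benchmark_id", "iteration"]).reset_index().fillna(value=0)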

def filter_data_five_percent(dataframe):
    """
    Filters out the rows whose sample weight is zero, i.e. where the best
    data structure improves the score by less than 5% (rows where the current
    structure is already the best are kept).
    """
    dataframe["Ratio improvement"] = compute_sample_ratios(dataframe)
    dataframe["Sample weight"] = compute_sample_weights(dataframe)

    return dataframe[dataframe["Sample weight"] > 0]


def get_file_metadata(filename):
    """
    Gets some important information about files, so we can store
    it with the classifier
    - name of file
    - hash of the file
    - date of creation
    """
    with open(filename, 'rb') as f:
        hash_file = hashlib.sha1(f.read())
    # Apparently, on Unix, this is actually the last time
    # the inode was modified
    creation_t = os.path.getctime(filename)
    creation_dt = datetime.datetime.fromtimestamp(creation_t)

    return { "filename" : filename,
             "hash" : hash_file.hexdigest(),
             "creation-date" : creation_dt.isoformat() }


def get_current_commit():
    # check_output returns bytes; decode so the metadata stores a plain string
    return subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")


def validate_jmh_data(jmh_data):
    expected_columns = ['Benchmark', 'Mode', 'Threads', 'Samples', 'Score',
       'Score Error (99.9%)', 'Unit', 'Param: applicationSize',
       'Param: baseStructureSize', 'Param: datastructureName',
       'Param: methodSelectionStrategyId', 'Param: seed']

    assert (jmh_data.columns.sort_values() == sorted(expected_columns)).all()


def validate_features_data(features_data, number_benchmarks):
    expected_columns = ['benchmark_id', 'iteration', 'feature', 'feature_type', 'value']
    assert (features_data.columns == expected_columns).all()

    # The number of distinct benchmark IDs matches the number of benchmarks
    assert (features_data["benchmark_id"].unique().size == number_benchmarks)


def get_number_discarded_samples(dataframe):
    return len(dataframe[dataframe["Sample weight"] == 0])


def get_discared_samples_ratio(dataframe):
    ratio = float(get_number_discarded_samples(dataframe)) / len(dataframe)
    assert ratio <= 0.5
    return ratio



def save_data(prefix, dataframe):
    """
    Saves the provided dataframe to a CSV file with the
    provided prefix.
    Appends the time of creation to the file name.
    """
    now = datetime.datetime.now().isoformat()
    data_filename = prefix + "-" + now + ".csv"

    dataframe.to_csv(data_filename)
    print("Data saved to: '{0}'".format(data_filename))



if __name__ == "__main__":
    # The script takes a number of files as input,
    # loads the JMH data and the SW & HW performance counters,
    # merges them into a well-formed data file,
    # and trains the classifiers.

    #%%

    print("Loading training data...")
    # training_data = load_training_data("data/jmh-results-9307f70f.csv",
    #                                    "data/2020-03-18T13:17:45+01:00-software-counters.csv",
    #                                    "data/hardware-perf-data.csv")

    jmh_data_filename = sys.argv[1]
    features_data_filename = sys.argv[2]

    print("Arguments:", { "jmh-data" : jmh_data_filename,
                          "features-data" : features_data_filename})

    jmh_data = pd.read_csv(jmh_data_filename)
    features_data = pd.read_csv(features_data_filename)

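    # Note: sys.argv includes the script itself (sys.argv[0]), so its hash
    # and timestamp are recorded along with the input data files.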
    input_metadata = [get_file_metadata(f) for f in sys.argv]

    training_metadata = { "input-files" : input_metadata, "commit" : get_current_commit() }

    # Deactivated because the features were "never" normalized?
    # assert are_features_normalized(features_data), "Features are not normalized!"

    training_data = prepare_long_form_data(jmh_data, features_data)

    print("Training data loaded")

    save_data("training-data", training_data["data"])

    training_data_filtered = { "software_selected_columns" : training_data["software_selected_columns"],
                               "hardware_selected_columns" : training_data["hardware_selected_columns"],
                               "data" : filter_data_five_percent(training_data["data"])}

    dropped_samples = len(training_data["data"]) - len(training_data_filtered["data"])
    print("Ratio of dropped samples: {0:.2f}"\
          .format(float(dropped_samples) / len(training_data["data"])))

    #%%


    classifier_creators = {"RandomForest" : lambda : RandomForestClassifier(),
                           "LDA": lambda : LinearDiscriminantAnalysis(n_components=2),
                           "ANN" : lambda : MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=1000,
                                                          hidden_layer_sizes=(5, 5),
                                                          random_state=1) }

    classifiers = train_classifiers(training_data["data"],
                                    classifier_creators["ANN"],
                                    "Best datastructure",
                                    training_data["software_selected_columns"] + training_data["hardware_selected_columns"],
                                    "Param: datastructureName")

    to_save = { "metadata" : training_metadata,
                "classifiers" : classifiers }

    filename = 'jbrainy-classifier.pickle'
    with open(filename, 'wb') as output_file:
        pickle.dump(to_save, output_file)

    print("Classifiers saved at: '{0}'".format(filename))

    #%%