Coverageability prediction
The replication package for the paper 'Learning to predict test effectiveness', published in the International Journal of Intelligent Systems.
Goal
This script implements the machine learning models used to predict the expected value of statement and branch coverage (Coverageability), as presented in the International Journal of Intelligent Systems paper.
Machine learning models
- Model 1: DecisionTreeRegressor
- Model 2: RandomForestRegressor
- Model 3: GradientBoostingRegressor
- Model 4: HistGradientBoostingRegressor
- Model 5: SGDRegressor
- Model 6: MLPRegressor
Learning datasets
| Dataset | Applied preprocessing | Number of metrics |
| --- | --- | --- |
| DS1 (default) | Simple classes elimination, data classes elimination, outliers elimination, and metric standardization | 262 |
| DS2 | DS1 + Feature selection | 20 |
| DS3 | DS1 + Context vector elimination | 194 |
| DS4 | DS1 + Context vector elimination and lexical metrics elimination | 177 |
| DS5 | DS1 + Systematically generated metrics elimination | 71 |
| DS6 | Top 15 important source code metrics affecting Coverageability | 15 |
Model dependent variable
E[C] = 1/2 × Statement coverage + 1/2 × Branch coverage

Coverageability = E[C] / |T|, where |T| is the number of tests in the generated test suite.
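A minimal sketch of how these dependent variables are derived from the raw dataset columns (column names follow DS06013.csv; the labels are stored as percentages, hence the 0.01 scaling also used in Regression.__init__):

import pandas as pd

df = pd.read_csv(r'dataset06/DS06013.csv')
statement = df['Label_LineCoverage'] * 0.01        # statement (line) coverage in [0, 1]
branch = df['Label_BranchCoverage'] * 0.01         # branch coverage in [0, 1]
expected_coverage = 0.5 * statement + 0.5 * branch  # E[C], the arithmetic mean
coverageability = expected_coverage / df['Tests']   # E[C] scaled by test suite size |T|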
Results
The results will be saved in the sklearn_models6c directory.
Inferences
Use the inference_model2 method of the Regression class to predict the testability of new Java classes.
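For example, a minimal usage sketch (assuming the adafest package is on the Python path; the model and data paths below are illustrative and mirror paths used elsewhere in this package):

from adafest.code.testability.ml_models_coverageability import Regression

reg = Regression(df_path=r'dataset06/DS06510.csv')  # fits the scaler on the learning dataset
reg.inference_model2(
    model_path=r'sklearn_models6c/branch/RFR6_DS5.joblib',  # any model saved by regress()
    predict_data_path=r'dataset06/refactored01010.csv',     # metrics of the new Java classes
)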
MultioutputClassification
https://scikit-learn.org/stable/modules/multiclass.html#multioutput-classification Multioutput-multiclass classification (also known as multitask classification)
Source code in adafest\code\testability\ml_models_coverageability.py
class MultioutputClassification:
"""
https://scikit-learn.org/stable/modules/multiclass.html#multioutput-classification
Multioutput-multiclass classification (also known as multitask classification)
"""
pass
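Since the class body is only a placeholder, here is a minimal, self-contained sketch of the multioutput-multiclass setting its docstring points to, using scikit-learn's MultiOutputClassifier on synthetic, purely illustrative data:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(42)
X = rng.rand(100, 5)                       # 100 samples, 5 features
y = rng.randint(0, 3, size=(100, 2))       # two targets, each with three classes
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42)).fit(X, y)
print(clf.predict(X[:3]))                  # one predicted label per target per sample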
Regression
Source code in adafest\code\testability\ml_models_coverageability.py
class Regression(object):
def __init__(self, df_path=r'dataset06/DS06013.csv', avg_type=None):
self.df = pd.read_csv(df_path, delimiter=',', index_col=False)
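# Coverage labels are stored as percentages; rescale them to the [0, 1] interval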
self.df['Label_Combine1'] = self.df['Label_Combine1'] * 0.01
self.df['Label_LineCoverage'] = self.df['Label_LineCoverage'] * 0.01
self.df['Label_BranchCoverage'] = self.df['Label_BranchCoverage'] * 0.01
self.df['Coverageability1'] = self.df['Coverageability1'] * 0.01
label_coverageability = self.df['Label_Combine1'] / self.df['Tests'] # (Arithmetic mean)
if avg_type is not None:
label_coverageability2 = list() # (Geometric mean)
label_coverageability3 = list() # (Harmonic mean)
for row in self.df.iterrows():
# print(row[1][-3])  # debug: test suite size of the current row
label_coverageability2.append(
(math.sqrt(row[1][-4] * row[1][-5])) / row[1][-3]
) # (Geometric mean)
label_coverageability3.append(
((2 * row[1][-4] * row[1][-5]) / (row[1][-4] + row[1][-5])) / row[1][-3]
) # (Harmonic mean)
label_coverageability2 = pd.DataFrame(label_coverageability2)
label_coverageability3 = pd.DataFrame(label_coverageability3)
# print('Before applying filter:', self.df.shape)
# self.df = self.df.loc[(self.df.Label_BranchCoverage <= 0.50)]
# self.df = self.df.loc[(self.df.Label_LineCoverage <= 0.50)]
# print('After applying filter:', self.df.shape)
# index -1: Coverageability1 (i.e., Testability)
# index -2: E[C] = 0.5*branch + 0.5*line (Arithmetic mean) ==> model names: XXX1_DSX
# index -3: Test suite size
# index -4: BranchCoverage ==> model names: XXX2_DSX
# index -5: LineCoverage ==> model names: XXX3_DSX
self.X_train1, self.X_test1, self.y_train, self.y_test = train_test_split(
self.df.iloc[:, 1:-5],
# self.df.iloc[:, -2],
# label_coverageability,
self.df['Label_BranchCoverage'],
test_size=0.25,
random_state=42,
# stratify=self.df.iloc[:, -1]
)
"""
# ---------------------------------------
# -- Feature selection (For DS2)
selector = feature_selection.SelectKBest(feature_selection.f_regression, k=15)
# clf = linear_model.LassoCV(eps=1e-3, n_alphas=100, normalize=True, max_iter=5000, tol=1e-4)
# clf.fit(self.X_train1, self.y_train)
# importance = np.abs(clf.coef_)
# print('importance', importance)
# clf = RandomForestRegressor()
# selector = feature_selection.SelectFromModel(clf, prefit=False, norm_order=2, max_features=20, threshold=None)
selector.fit(self.X_train1, self.y_train)
# Get columns to keep and create new dataframe with only selected features
cols = selector.get_support(indices=True)
self.X_train1 = self.X_train1.iloc[:, cols]
self.X_test1 = self.X_test1.iloc[:, cols]
print('Selected columns by feature selection:', self.X_train1.columns)
# quit()
# -- End of feature selection
"""
# ---------------------------------------
# Standardization
self.scaler = preprocessing.RobustScaler(with_centering=True, with_scaling=True)
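# RobustScaler centers on the median and scales by the IQR, so it is less sensitive to residual outliers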
# self.scaler = preprocessing.StandardScaler()
self.scaler.fit(self.X_train1)
self.X_train = self.scaler.transform(self.X_train1)
self.X_test = self.scaler.transform(self.X_test1)
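# Persist the fitted scaler so the same transformation can be reapplied to new data at inference time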
dump(self.scaler, 'DS06510.joblib')
# quit()
def inference_model(self, model=None, model_path=None):
if model is None:
model = joblib.load(model_path)
y_true, y_pred = self.y_test, model.predict(self.X_test[3:4, :])
print('X_test {0}'.format(self.X_test[3:4, :]))
print('------')
print('y_test or y_true {0}'.format(y_true.iloc[3:4]))
print('------')
print('y_pred by model {0}'.format(y_pred))
y_true, y_pred = self.y_test, model.predict(self.X_test)
df_new = pd.DataFrame(columns=self.df.columns)
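# Note: Series.iteritems() and DataFrame.append() were removed in pandas 2.0; this code targets older pandas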
for i, row in self.y_test.iteritems():
print('', i, row)
df_new = df_new.append(self.df.loc[i], ignore_index=True)
df_new['y_true'] = self.y_test.values
df_new['y_pred'] = list(y_pred)
df_new.to_csv(model_path[:-7] + '_inference_result.csv', index=True, index_label='Row')
def inference_model2(self, model=None, model_path=None, predict_data_path=None):
if model is None:
model = joblib.load(model_path)
df_predict_data = pd.read_csv(predict_data_path, delimiter=',', index_col=False)
X_test1 = df_predict_data.iloc[:, 1:]
X_test = self.scaler.transform(X_test1)
y_pred = model.predict(X_test)
df_new = pd.DataFrame(df_predict_data.iloc[:, 0], columns=['Class'])
df_new['PredictedTestability'] = list(y_pred)
print(df_new)
# df_new.to_csv(r'dataset06/refactored01010_predicted_testability.csv', index=True, index_label='Row')
def evaluate_model(self, model=None, model_path=None):
# X = self.data_frame.iloc[:, 1:-4]
# y = self.data_frame.iloc[:, -4]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
if model is None:
model = joblib.load(model_path)
y_true, y_pred = self.y_test, model.predict(self.X_test)
# y_score = model.predict_proba(X_test)
# Print all regression model metrics
print('Evaluating regressor ...')
print('Regressor minimum prediction', min(y_pred), 'Regressor maximum prediction', max(y_pred))
df = pd.DataFrame()
df['r2_score_uniform_average'] = [r2_score(y_true, y_pred, multioutput='uniform_average')]
df['r2_score_variance_weighted'] = [r2_score(y_true, y_pred, multioutput='variance_weighted')]
df['explained_variance_score_uniform_average'] = [
explained_variance_score(y_true, y_pred, multioutput='uniform_average')]
df['explained_variance_score_variance_weighted'] = [
explained_variance_score(y_true, y_pred, multioutput='variance_weighted')]
df['mean_absolute_error'] = [mean_absolute_error(y_true, y_pred)]
df['mean_squared_error_MSE'] = [mean_squared_error(y_true, y_pred)]
df['mean_squared_error_RMSE'] = [mean_squared_error(y_true, y_pred, squared=False)]
df['median_absolute_error'] = [median_absolute_error(y_true, y_pred)]
if min(y_pred) >= 0:
df['mean_squared_log_error'] = [mean_squared_log_error(y_true, y_pred)]
# To handle "ValueError: Mean Tweedie deviance error with power=2
# can only be used on strictly positive y and y_pred."
if min(y_pred) > 0 and min(y_true) > 0:
df['mean_poisson_deviance'] = [mean_poisson_deviance(y_true, y_pred)]
df['mean_gamma_deviance'] = [mean_gamma_deviance(y_true, y_pred)]
df['max_error'] = [max_error(y_true, y_pred)]
df.to_csv(model_path[:-7] + '_evaluation_metrics_R1.csv', index=True, index_label='Row')
def evaluate_model_class(self, model=None, model_path=None):
if model is None:
model = joblib.load(model_path)
y_true, y_pred = self.y_test, model.predict(self.X_test)
df_new = pd.DataFrame(y_true)
df_new['y_pred'] = y_pred
# testability_labels = ['VeryLow', 'Low', 'Moderate', 'High', 'VeryHigh']  # unused five-level alternative
testability_labels = ['Low', 'Moderate', 'High']
bins = [-1.250, 0.250, 0.750, 1.250]
# bins = 5
df_new['y_true_nominal'] = pd.cut(df_new.loc[:, ['Coverageability1']].T.squeeze(),
bins=bins,
labels=testability_labels,
right=True
)
df_new['y_pred_nominal'] = pd.cut(df_new.loc[:, ['y_pred']].T.squeeze(),
bins=bins,
labels=testability_labels,
right=True
)
print(df_new)
# df_new.to_csv('XXXXX.csv')
y_true = df_new['y_true_nominal']
y_pred = df_new['y_pred_nominal']
y_score = y_pred
# Print all classifier model metrics
print('Evaluating classifier ...')
df = pd.DataFrame()
print(y_pred)
try:
df['accuracy_score'] = [accuracy_score(y_true, y_pred)]
df['balanced_accuracy_score'] = [balanced_accuracy_score(y_true, y_pred)]
df['precision_score_macro'] = [precision_score(y_true, y_pred, average='macro')]
df['precision_score_micro'] = [precision_score(y_true, y_pred, average='micro')]
df['recall_score_macro'] = [recall_score(y_true, y_pred, average='macro')]
df['recall_score_micro'] = [recall_score(y_true, y_pred, average='micro')]
df['f1_score_macro'] = [f1_score(y_true, y_pred, average='macro')]
df['f1_score_micro'] = [f1_score(y_true, y_pred, average='micro')]
df['fbeta_score_macro'] = [fbeta_score(y_true, y_pred, beta=0.5, average='macro')]
df['fbeta_score_micro'] = [fbeta_score(y_true, y_pred, beta=0.5, average='micro')]
# df['log_loss'] = [log_loss(y_true, y_score)]
# df['roc_auc_score_ovr_macro'] = [roc_auc_score(y_true, y_score, multi_class='ovr', average='macro')]
# df['roc_auc_score_ovr_micro'] = [roc_auc_score(y_true, y_score, multi_class='ovr', average='weighted')]
# df['roc_auc_score_ovo_macro'] = [roc_auc_score(y_true, y_score, multi_class='ovo', average='macro')]
# df['roc_auc_score_ovo_micro'] = [roc_auc_score(y_true, y_score, multi_class='ovo', average='weighted')]
# print('roc_curve_:', roc_curve(y_true, y_score)) # multiclass format is not supported
df.to_csv(model_path[:-7] + '_evaluation_metrics_C.csv', index=True, index_label='Row')
except Exception as error:
raise ValueError('The prediction is out of range') from error
def regress_with_decision_tree(self, model_path):
# X = self.data_frame.iloc[:, 1:-4]
# y = self.data_frame.iloc[:, -4]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
clf = tree.DecisionTreeRegressor()
# CrossValidation iterator object:
# https://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
# Set the parameters to be used for tuning by cross-validation
parameters = {'max_depth': range(1, 100, 10),
'criterion': ['mse', 'friedman_mse', 'mae'],
'min_samples_split': range(2, 20, 1)
}
# Set the objectives which must be optimized during parameter tuning
# scoring = ['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'neg_mean_absolute_error',]
scoring = ['neg_root_mean_squared_error', ]
# Find the best model using grid search with cross-validation
clf = GridSearchCV(clf, param_grid=parameters, scoring=scoring, cv=cv, n_jobs=4,
refit='neg_root_mean_squared_error')
clf.fit(X=self.X_train, y=self.y_train)
print('Writing grid search result ...')
df = pd.DataFrame(clf.cv_results_, )
df.to_csv(model_path[:-7] + '_grid_search_cv_results.csv', index=False)
df = pd.DataFrame()
print('Best parameters set found on development set:', clf.best_params_)
df['best_parameters_development_set'] = [clf.best_params_]
print('Best regressor score on development set:', clf.best_score_)
df['best_score_development_set'] = [clf.best_score_]
print('Best regressor score on test set:', clf.score(self.X_test, self.y_test))
df['best_score_test_set'] = [clf.score(self.X_test, self.y_test)]
df.to_csv(model_path[:-7] + '_grid_search_cv_results_best.csv', index=False)
# Save and evaluate the best obtained model
print('Writing evaluation result ...')
clf = clf.best_estimator_
y_true, y_pred = self.y_test, clf.predict(self.X_test)
dump(clf, model_path)
self.evaluate_model(model=clf, model_path=model_path)
# Plots
# tree.plot_tree(clf)
# plt.show()
def regress(self, model_path: str = None, model_number: int = None):
"""
:param model_path:
:param model_number: 1: DTR, 2: RFR, 3: GBR, 4: HGBR, 5: SGDR, 6: MLPR,
:return:
"""
regressor = None
parameters = None
if model_number == 1:
regressor = tree.DecisionTreeRegressor(random_state=42, )
# Set the parameters to be used for tuning by cross-validation
parameters = {
# 'criterion': ['mse', 'friedman_mse', 'mae'],
'max_depth': range(3, 50, 5),
'min_samples_split': range(2, 30, 2)
}
elif model_number == 2:
regressor = RandomForestRegressor(random_state=42, )
parameters = {
'n_estimators': range(100, 200, 100),
# 'criterion': ['mse', 'mae'],
'max_depth': range(10, 50, 10),
# 'min_samples_split': range(2, 30, 2),
# 'max_features': ['auto', 'sqrt', 'log2']
}
elif model_number == 3:
regressor = GradientBoostingRegressor(n_estimators=400, learning_rate=0.05, random_state=42, )
parameters = {
# 'loss': ['ls', 'lad', ],
'max_depth': range(10, 50, 10),
'min_samples_split': range(2, 30, 3)
}
elif model_number == 4:
regressor = HistGradientBoostingRegressor(max_iter=400, learning_rate=0.05, random_state=42, )
parameters = {
# 'loss': ['least_squares', 'least_absolute_deviation'],
'max_depth': range(10, 50, 10),
'min_samples_leaf': range(5, 50, 10)
}
elif model_number == 5:
regressor = linear_model.SGDRegressor(early_stopping=True, n_iter_no_change=5, random_state=42, )
parameters = {
'loss': ['squared_loss', 'huber', 'epsilon_insensitive'],
'penalty': ['l2', 'l1', 'elasticnet'],
'max_iter': range(50, 1000, 50),
'learning_rate': ['invscaling', 'optimal', 'constant', 'adaptive'],
'eta0': [0.1, 0.01],
'average': [32, ]
}
elif model_number == 6:
regressor = MLPRegressor(random_state=42, )
parameters = {
'hidden_layer_sizes': [(256, 100), (512, 256, 100), ],
'activation': ['tanh', ],
'solver': ['adam', ],
'max_iter': range(50, 200, 50)
}
if regressor is None:
return
if parameters is None:
return
# Set the objectives which must be optimized during parameter tuning
# scoring = ['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'neg_mean_absolute_error',]
scoring = ['neg_root_mean_squared_error', ]
# CrossValidation iterator object:
# https://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
# Find the best model using grid search with cross-validation
clf = GridSearchCV(regressor, param_grid=parameters, scoring=scoring, cv=cv, n_jobs=4,
refit='neg_root_mean_squared_error')
print('fitting model number', model_number)
clf.fit(X=self.X_train, y=self.y_train)
print('Writing grid search result ...')
df = pd.DataFrame(clf.cv_results_, )
df.to_csv(model_path[:-7] + '_grid_search_cv_results.csv', index=False)
df = pd.DataFrame()
print('Best parameters set found on development set:', clf.best_params_)
df['best_parameters_development_set'] = [clf.best_params_]
print('Best regressor score on development set:', clf.best_score_)
df['best_score_development_set'] = [clf.best_score_]
print('Best regressor score on test set:', clf.score(self.X_test, self.y_test))
df['best_score_test_set'] = [clf.score(self.X_test, self.y_test)]
df.to_csv(model_path[:-7] + '_grid_search_cv_results_best.csv', index=False)
# Save and evaluate the best obtained model
print('Writing evaluation result ...')
clf = clf.best_estimator_
y_true, y_pred = self.y_test, clf.predict(self.X_test)
dump(clf, model_path)
self.evaluate_model(model=clf, model_path=model_path)
# self.evaluate_model_class(model=clf, model_path=model_path)
# self.inference_model(model=clf, model_path=model_path)
print('=' * 75)
def vote(self, model_path=None, dataset_number=1):
# Trained regressors
reg1 = load(r'sklearn_models6c/branch/HGBR6_DS{0}.joblib'.format(dataset_number))
reg2 = load(r'sklearn_models6c/branch/RFR6_DS{0}.joblib'.format(dataset_number))
reg3 = load(r'sklearn_models6c/branch/MLPR6_DS{0}.joblib'.format(dataset_number))
# reg4 = load(r'sklearn_models6/SGDR1_DS1.joblib')
ereg = VotingRegressor([('HGBR6_DS{0}'.format(dataset_number), reg1),
('RFR6_DS{0}'.format(dataset_number), reg2),
('MLPR6_DS{0}'.format(dataset_number), reg3)
],
weights=[3. / 6., 2. / 6., 1. / 6.])
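# Heavier weight for HGBR, then RFR, then MLPR; the weights sum to 1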
ereg.fit(self.X_train, self.y_train)
dump(ereg, model_path)
self.evaluate_model(model=ereg, model_path=model_path)
try:
self.evaluate_model_class(model=ereg, model_path=model_path)
except:
print('Prediction is out of the range.')
regress(self, model_path=None, model_number=None)
:param model_path: path where the fitted model and its result files are saved
:param model_number: 1: DTR, 2: RFR, 3: GBR, 4: HGBR, 5: SGDR, 6: MLPR
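A brief usage sketch (the paths mirror those used in train_on_ds6 below):

reg = Regression(df_path=r'dataset06/DS06510.csv')
reg.regress(model_path=r'sklearn_models6c/branch/RFR6_DS5.joblib', model_number=2)  # RandomForestRegressor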
Source code in adafest\code\testability\ml_models_coverageability.py (the method body is identical to the regress method shown in the Regression class listing above).
create_coverageability_dataset_with_only_important_metrics()
Create DS#6 (DS06610) for use in Mr Esmaeili's project: select only the top 15 most important Coverageability features.
Source code in adafest\code\testability\ml_models_coverageability.py
def create_coverageability_dataset_with_only_important_metrics():
"""
Create DS#6 (DS06610).
For use in Mr Esmaeili's project.
Select only the top 15 most important Coverageability features.
:return:
"""
df_path = r'dataset06/DS06013.csv'
df_important_metrics_path = r'dataset06/DS06610.csv'
df = pd.read_csv(df_path, delimiter=',', index_col=False)
df_imp = pd.DataFrame()
df_imp['Class'] = df['Class'] # 0
df_imp['CSORD_SumCyclomaticStrict'] = df['CSORD_SumCyclomaticStrict'] # 1
df_imp['CSLEX_NumberOfConditionalJumpStatements'] = df['CSLEX_NumberOfConditionalJumpStatements'] # 2
df_imp['CSORD_LogCyclomaticStrict'] = df['CSORD_LogCyclomaticStrict'] # 3
df_imp['CSORD_CSNOMNAMM'] = df['CSORD_CSNOMNAMM'] # 4
df_imp['CSORD_NIM'] = df['CSORD_NIM'] # 5
df_imp['CSORD_LogStmtDecl'] = df['CSORD_LogStmtDecl'] # 6
df_imp['CSORD_CountDeclMethodPrivate'] = df['CSORD_CountDeclMethodPrivate'] # 7
df_imp['CSORD_CountDeclClassMethod'] = df['CSORD_CountDeclClassMethod'] # 8
df_imp['CSORD_NumberOfClassConstructors'] = df['CSORD_NumberOfClassConstructors'] # 9
df_imp['CSORD_MinLineCode'] = df['CSORD_MinLineCode'] # 10
df_imp['CSORD_SumCyclomatic'] = df['CSORD_SumCyclomatic'] # 11
df_imp['CSLEX_NumberOfReturnAndPrintStatements'] = df['CSLEX_NumberOfReturnAndPrintStatements'] # 12
df_imp['CSORD_MaxInheritanceTree'] = df['CSORD_MaxInheritanceTree'] # 13
df_imp['CSLEX_NumberOfIdentifies'] = df['CSLEX_NumberOfIdentifies'] # 14
df_imp['CSORD_CountDeclMethodPublic'] = df['CSORD_CountDeclMethodPublic'] # 15
# Runtime metrics
df_imp['Label_Combine1'] = df['Label_Combine1']
df_imp['Label_LineCoverage'] = df['Label_LineCoverage']
df_imp['Label_BranchCoverage'] = df['Label_BranchCoverage']
df_imp['Coverageability1'] = df['Coverageability1']
df_imp['Tests'] = df['Tests']
df_imp.to_csv(df_important_metrics_path, index=False)
train_on_ds6()
To be used to predict the expected value of statement and branch coverage.
index -1: Coverageability1 (i.e., Testability)
index -2: E[C] = 0.5*branch + 0.5*line (Arithmetic mean) ==> model names: XXX1_DSX
index -3: Test suite size
index -4: BranchCoverage ==> model names: XXX2_DSX
index -5: LineCoverage ==> model names: XXX3_DSX
index new_col1: Coverageability (Arithmetic mean) ==> model names: XXX4_DSX
index new_col2: Coverageability2 (Geometric mean) ==> model names: XXX5_DSX
index new_col3: Coverageability3 (Harmonic mean) ==> model names: XXX6_DSX
Source code in adafest\code\testability\ml_models_coverageability.py
def train_on_ds6():
"""
To be used to predict the expected value of statement and branch coverage.
index -1: Coverageability1 (i.e., Testability)
index -2: E[C] = 0.5*branch + 0.5*line (Arithmetic mean) ==> model names: XXX1_DSX
index -3: Test suite size
index -4: BranchCoverage ==> model names: XXX2_DSX
index -5: LineCoverage ==> model names: XXX3_DSX
index new_col1: Coverageability (Arithmetic mean) ==> model names: XXX4_DSX
index new_col2: Coverageability2 (Geometric mean) ==> model names: XXX5_DSX
index new_col3: Coverageability3 (Harmonic mean) ==> model names: XXX6_DSX
Returns:
"""
# DS1
# reg = Regression(df_path=r'dataset06/DS06013.csv')
# reg.regress(model_path=r'sklearn_models6c/DTR1_DS1.joblib', model_number=1)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/RFR6_DS1.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/GBR1_DS1.joblib', model_number=3)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/HGBR6_DS1.joblib', model_number=4)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/SGDR6_DS1.joblib', model_number=5)
# reg.regress(model_path=r'sklearn_models6c/statement/MLPR3_DS1.joblib', model_number=6)
# reg.vote(model_path=r'sklearn_models6c/statement/VR3_DS1.joblib', dataset_number=1)
# reg.evaluate_model(model_path=r'sklearn_models6/HGBR1_DS1.joblib',)
# reg.inference_model2(model_path=r'sklearn_models6/VR1_DS1.joblib',
# predict_data_path=r'dataset06/refactored01010.csv')
# reg.inference_model2(model_path=r'sklearn_models6/VR1_DS1.joblib',
# predict_data_path=r'D:/IdeaProjects/10_water-simulator/site_1/metrics1_1.csv')
# quit()
# DS 1/2
# reg.regress(model_path=r'sklearn_models6c/DTR1_DS2.joblib', model_number=1)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/RFR6_DS2.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/GBR1_DS2.joblib', model_number=3)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/HGBR6_DS2.joblib', model_number=4)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/SGDR6_DS2.joblib', model_number=5)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/MLPR6_DS2.joblib', model_number=6)
# reg.vote(model_path=r'sklearn_models6c/coveragability3/VR6_DS2.joblib', dataset_number=2)
# quit()
# DS 3
# reg = Regression(df_path=r'dataset06/DS06310.csv')
# reg.regress(model_path=r'sklearn_models6c/DTR1_DS3.joblib', model_number=1)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/RFR6_DS3.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/GBR1_DS3.joblib', model_number=3)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/HGBR6_DS3.joblib', model_number=4)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/SGDR6_DS3.joblib', model_number=5)
# reg.regress(model_path=r'sklearn_models6c/statement/MLPR3_DS3.joblib', model_number=6)
# reg.vote(model_path=r'sklearn_models6c/statement/VR3_DS3.joblib', dataset_number=3)
# DS 4
# reg = Regression(df_path=r'dataset06/DS06410.csv')
# reg.regress(model_path=r'sklearn_models6c/DTR1_DS4.joblib', model_number=1)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/RFR6_DS4.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/GBR1_DS4.joblib', model_number=3)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/HGBR6_DS4.joblib', model_number=4)
# reg.regress(model_path=r'sklearn_models6c/coveragability3/SGDR6_DS4.joblib', model_number=5)
# reg.regress(model_path=r'sklearn_models6c/statement/MLPR3_DS4.joblib', model_number=6)
# reg.vote(model_path=r'sklearn_models6c/statement/VR3_DS4.joblib', dataset_number=4)
# DS5
reg = Regression(df_path=r'dataset06/DS06510.csv')
# reg.regress(model_path=r'sklearn_models6c/branch/DTR6_DS5.joblib', model_number=1)
reg.regress(model_path=r'sklearn_models6c/branch/RFR6_DS5.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/branch/GBR6_DS5.joblib', model_number=3)
reg.regress(model_path=r'sklearn_models6c/branch/HGBR6_DS5.joblib', model_number=4)
reg.regress(model_path=r'sklearn_models6c/branch/SGDR6_DS5.joblib', model_number=5)
reg.regress(model_path=r'sklearn_models6c/branch/MLPR6_DS5.joblib', model_number=6)
reg.vote(model_path=r'sklearn_models6c/branch/VR6_DS5.joblib', dataset_number=5)
# quit()
# Added for Mr. Esmaeili's work
# DS6 (important metrics)
df_important_metrics_path = r'dataset06/DS06610.csv'
reg = Regression(df_path=df_important_metrics_path)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/DTR6_DS6.joblib', model_number=1)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/RFR6_DS6.joblib', model_number=2)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/GBR6_DS6.joblib', model_number=3)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/HGBR6_DS6.joblib', model_number=4)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/SGDR6_DS6.joblib', model_number=5)
# reg.regress(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/MLPR6_DS6.joblib', model_number=6)
# reg.vote(model_path=r'sklearn_models6c/coveragability_arithmetic_mean/VR6_DS6.joblib', dataset_number=6)
model_path = r'sklearn_models6c/coveragability/VR4_DS3.joblib'
scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error']
n_repeat = [10, 20, 30, 40, 50]
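# compute_permutation_importance is assumed to be defined elsewhere in this module (not shown in this listing)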
for score in scoring:
for r in n_repeat:
compute_permutation_importance(model_path=model_path, scoring=score, n_repeats=r, )