Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
COM6001M Computer Science Major Project
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
benjamin.clough
COM6001M Computer Science Major Project
Commits
109d33b1
Commit
109d33b1
authored
Apr 17, 2023
by
Benjamin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
OOPified many classes.
Exported some models to files
parent
ca7c93f5
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
370 additions
and
252 deletions
+370
-252
DL_classifiers.py
custom_models/classifiers/DL_classifiers.py
+31
-93
ML_classifiers.py
custom_models/classifiers/ML_classifiers.py
+17
-47
algorithmic_feature_extraction_selection.py
...on_extraction/algorithmic_feature_extraction_selection.py
+69
-31
main.py
main.py
+34
-5
ModelTemplates.py
project_utilities/ModelTemplates.py
+50
-0
evaluators.py
project_utilities/evaluators.py
+60
-5
model_interaction.py
project_utilities/model_interaction.py
+35
-0
my_datasets.py
project_utilities/my_datasets.py
+74
-71
No files found.
custom_models/classifiers/DL_classifiers.py
View file @
109d33b1
import
keras
import
numpy
import
pandas
as
pd
from
project_utilities.ModelTemplates
import
KerasDeepLearningModel
from
project_utilities.my_datasets
import
ITSupportDatasetBuilder
from
custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection
import
TFIDF_Model
from
project_utilities.model_interaction
import
KerasModelFileInteraction
,
SKLearnModelFileInteraction
from
sklearn.preprocessing
import
LabelEncoder
from
keras.utils
import
to_categorical
import
custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection
import
custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection
from
project_utilities
import
my_datasets
from
sklearn.feature_extraction.text
import
CountVectorizer
,
TfidfVectorizer
from
sklearn.model_selection
import
train_test_split
from
keras.models
import
Sequential
from
keras
import
layers
from
keras.backend
import
clear_session
import
tensorflow
as
tf
from
keras
import
metrics
from
pandas
import
DataFrame
class KerasCNN(KerasDeepLearningModel):
    """Keras model wrapper that trains ``self.model`` against validation data."""

    def __init__(self, model=None):
        """Store an optional, already constructed model.

        :param model: object exposing ``fit``; ``None`` leaves the wrapper
            empty until a model is assigned or loaded.
        """
        # Forward the argument to the base initialiser; it was previously
        # dropped, so a model passed here never reached ``self.model``.
        super().__init__(model)

    def train_model(self, vectors, labels, test_vectors, test_labels, epochs=50, batch_size=50, callbacks=None):
        """Fit ``self.model`` exactly once.

        :param vectors: training inputs passed to ``fit``.
        :param labels: training targets passed to ``fit``.
        :param test_vectors: validation inputs.
        :param test_labels: validation targets.
        :param epochs: number of epochs (previously ignored in favour of a
            hard-coded 50).
        :param batch_size: batch size (previously ignored in favour of a
            hard-coded 50).
        :param callbacks: a single callback, or a list/tuple of callbacks;
            falsy means "no callbacks".
        """
        fit_kwargs = {
            'epochs': epochs,
            'batch_size': batch_size,
            'validation_data': (test_vectors, test_labels),
        }
        if callbacks:
            # A lone callback is wrapped; an existing sequence is used as-is
            # so callers may pass either form.
            if isinstance(callbacks, (list, tuple)):
                fit_kwargs['callbacks'] = list(callbacks)
            else:
                fit_kwargs['callbacks'] = [callbacks]
        # Single fit call: the old code fell through after the no-callback
        # branch and trained a second time with ``callbacks=[None]``.
        self.model.fit(vectors, labels, **fit_kwargs)
class
PresetSoftmaxClassifier
:
vectorized_dataset
=
DataFrame
classes
=
list
def
make_predictions
(
self
,
vectors
):
return
self
.
model
.
predict
(
vectors
)
def
__init__
(
self
,
vectorized_dataset
,
classes
:
list
):
self
.
vectorized_dataset
=
vectorized_dataset
self
.
classes
=
classes
if
__name__
==
'__main__'
:
dataset
=
my_datasets
.
ITSupportDatasetBuilder
()
\
if
__name__
==
"__main__"
:
dataset
=
ITSupportDatasetBuilder
()
\
.
with_summaries_and_descriptions_combined
()
\
.
with_overall_priority_column
()
\
.
with_pre_processed_descriptions
()
\
.
build
()
.
corpus
doc2vec_IT
=
custom_models
.
feature_selection_extraction
.
ML_DL_feature_extraction_selection
.
ITSupportDoc2VecImplementation
(
dataset
=
dataset
,
model_type
=
custom_models
.
feature_selection_extraction
.
ML_DL_feature_extraction_selection
.
Doc2VecModels
.
DBOW
)
# doc2vec_IT.pre_process_texts()
doc2vec_IT
.
tag_documents
()
doc2vec_IT
.
create_model
()
doc2vec_IT
.
build_vocabulary
()
doc2vec_IT
.
train_model
(
dataset_shuffles
=
1
,
epochs
=
10
)
# dataset_shuffles=10, epochs=30)
print
(
"Got here 0.5"
)
doc2vec_IT
.
generate_vectors
()
Z
=
tf
.
keras
.
utils
.
to_categorical
(
dataset
.
Priority
,
num_classes
=
5
)
print
(
Z
)
'''descriptions_train, descriptions_test, tfidf.training_labels, tfidf.testing_labels = train_test_split(
dataset.Descriptions, Z, test_size=0.3,
random_state=1000)
vectorizer.fit(descriptions_train)
tfidf.training_descriptions = vectorizer.transform(descriptions_train)
tfidf.testing_descriptions = vectorizer.transform(descriptions_test)'''
# tfidf.training_labels = tf.keras.utils.to_categorical(tfidf.training_labels, num_classes=5)
print
(
dataset
.
train_labels
)
# vectorizer.fit(tfidf.training_labels)
input_dim
=
dataset
.
training_descriptions
.
shape
[
1
]
#tfidf.training_descriptions.shape[1]
model
=
Sequential
()
model
.
add
(
layers
.
Dense
(
10
,
input_dim
=
input_dim
,
activation
=
'relu'
))
model
.
add
(
layers
.
Dense
(
5
,
activation
=
'softmax'
))
model
.
compile
(
loss
=
'binary_crossentropy'
,
optimizer
=
'adam'
,
metrics
=
[
metrics
.
Recall
()])
# model.summary()
history
=
model
.
fit
(
tfidf
.
training_descriptions
,
tfidf
.
training_labels
,
epochs
=
100
,
verbose
=
False
,
validation_data
=
(
tfidf
.
testing_descriptions
,
tfidf
.
testing_labels
),
batch_size
=
5
)
TFIDF_model
=
TFIDF_Model
()
TFIDF_model
.
from_file
(
'/custom_models/feature_selection_extraction/tfidf_model.joblib'
,
SKLearnModelFileInteraction
())
loss
,
accuracy
=
model
.
evaluate
(
tfidf
.
testing_descriptions
,
tfidf
.
testing_labels
,
verbose
=
False
)
print
(
"Testing Accuracy: {:.4f}"
.
format
(
accuracy
))
CNN_model
=
KerasCNN
(
)
CNN_model
.
from_file
(
'custom_models/classifiers/CNN_model.h5'
,
KerasModelFileInteraction
(
))
# matrix = metrics.confusion_matrix(tfidf.testing_labels.argmax(axis=1), y_prediction.argmax(axis=1
))
y_prediction
=
model
.
predict
(
tfidf
.
testing_descriptions
)
y_prediction
=
numpy
.
argmax
(
y_prediction
,
axis
=
1
)
tfidf
.
testing_labels
=
numpy
.
argmax
(
tfidf
.
testing_labels
,
axis
=
1
)
print
(
keras
.
metrics
.
categorical_accuracy
(
tfidf
.
testing_labels
,
y_predictio
n
))
# tf.keras.metrics.confusion_matrix(tfidf.testing_labels.argmax(axis=1), y_prediction.argmax(axis=1
))
vectorised_descriptions
=
TFIDF_model
.
vectorize_descriptions
(
dataset
[
'Description'
]
.
tolist
(
))
X_train
,
X_test
,
y_train
,
y_test
=
TFIDF_model
.
split_dataset
(
0.1
,
vectorised_descriptions
,
dataset
[
'Priority'
]
.
tolist
()
)
encoder
=
LabelEncoder
()
.
fit
([
'P5'
,
'P4'
,
'P3'
,
'P2'
,
'P1'
]
)
y_train
=
to_categorical
(
encoder
.
transform
(
y_trai
n
))
y_val
=
to_categorical
(
encoder
.
transform
(
y_test
))
# cm = ITSupportPriorityConfusionMatrixEvaluator(predictions=y_prediction, actual_values=tfidf.testing_labels, labels=['P1', 'P2', 'P3', 'P4', 'P5']
)
# clear_session(
)
encoded_predictions
=
CNN_model
.
make_predictions
(
X_test
)
decoded_predictions
=
encoder
.
inverse_transform
(
encoded_predictions
.
argmax
(
axis
=
1
)
)
# keras.metrics.confusion_matrix(tfidf.testing_labels, y_prediction)
'''from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(tfidf.training_descriptions, tfidf.training_labels)
score = classifier.score(tfidf.testing_descriptions, tfidf.testing_labels)'''
from
scikitplot.metrics
import
plot_confusion_matrix
import
matplotlib.pyplot
as
plt
fig
,
ax
=
plt
.
subplots
(
figsize
=
(
5
,
5
))
num_to_pnum
=
[
'P5'
,
'P4'
,
'P3'
,
'P2'
,
'P1'
]
tfidf
.
testing_labels_lab
=
[
num_to_pnum
[
x
]
for
x
in
tfidf
.
testing_labels
]
y_pred_lab
=
[
num_to_pnum
[
x
]
for
x
in
y_prediction
]
# print(tfidf.testing_labels_lab, type(tfidf.testing_labels))
# plot_confusion_matrix(tfidf.testing_labels_lab, y_pred_lab, ax=ax, labels=['P1', 'P2', 'P3', 'P4', 'P5'])
# plt.show()
from
project_utilities.evaluators
import
ITSupportPriorityConfusionMatrixEvaluator
cm
=
ITSupportPriorityConfusionMatrixEvaluator
(
predictions
=
y_pred_lab
,
actual_values
=
tfidf
.
testing_labels_lab
,
labels
=
[
'P1'
,
'P2'
,
'P3'
,
'P4'
,
'P5'
])
cm
.
plot_confusion_matrix
(
fullscreen_requested
=
True
)
custom_models/classifiers/ML_classifiers.py
View file @
109d33b1
from
enum
import
Enum
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.svm
import
LinearSVC
from
sklearn.ensemble
import
RandomForestClassifier
from
project_utilities.ModelTemplates
import
SKLearnMachineLearningModel
class
ModelType
(
Enum
):
MULTINOMIAL_LOGISTIC_REGRESSION
=
1
MULTINOMIAL_NAIVE_BAYES
=
2
LINEAR_SUPPORT_VECTOR_CLASSIFICATION
=
3
RANDOM_FOREST
=
4
class
ITMultinomialLogisticRegression
(
SKLearnMachineLearningModel
):
def
__init__
(
self
,
inverse_regularisation_strength
:
float
=
1e5
,
cores_allocated
:
int
=
1
):
super
()
.
__init__
(
LogisticRegression
(
n_jobs
=
cores_allocated
,
C
=
inverse_regularisation_strength
,
multi_class
=
'multinomial'
,
solver
=
'newton-cg'
,
verbose
=
1
))
class
ITMachineLearningClassifierImplementation
:
cores_allocated
:
int
class
ITMultinomialNaiveBayes
(
SKLearnMachineLearningModel
):
def
__init__
(
self
):
super
()
.
__init__
(
MultinomialNB
())
def
__init__
(
self
,
vectors
,
labels
,
cores_allocated
:
int
=
1
)
->
None
:
self
.
model
=
None
self
.
cores_allocated
=
cores_allocated
self
.
vectors
=
vectors
self
.
labels
=
labels
def
use_preconfigured_model
(
self
,
preconfigured_model
):
self
.
model
=
preconfigured_model
class
ITSupportVectorClassifier
(
SKLearnMachineLearningModel
):
def
__init__
(
self
):
super
()
.
__init__
(
LinearSVC
())
def
train_model
(
self
):
self
.
model
.
fit
(
self
.
vectors
,
self
.
labels
)
def
make_predictions
(
self
,
items
):
return
self
.
model
.
predict
(
items
)
class
ITMultinomialLogisticRegression
(
ITMachineLearningClassifierImplementation
):
def
__init__
(
self
,
vectors
,
labels
,
inverse_regularisation_strength
:
float
,
cores_allocated
:
int
=
1
):
super
()
.
__init__
(
vectors
=
vectors
,
labels
=
labels
,
cores_allocated
=
cores_allocated
)
self
.
model
=
LogisticRegression
(
n_jobs
=
self
.
cores_allocated
,
C
=
inverse_regularisation_strength
,
multi_class
=
'multinomial'
,
solver
=
'newton-cg'
,
verbose
=
1
)
class
ITMultinomialNaiveBayes
(
ITMachineLearningClassifierImplementation
):
def
__init__
(
self
,
vectors
,
labels
):
super
()
.
__init__
(
vectors
,
labels
)
self
.
model
=
MultinomialNB
()
class
ITSupportVectorClassifier
(
ITMachineLearningClassifierImplementation
):
def
__init__
(
self
,
vectors
,
labels
):
super
()
.
__init__
(
vectors
,
labels
)
self
.
model
=
LinearSVC
()
class
ITRandomForestClassifier
(
ITMachineLearningClassifierImplementation
):
def
__init__
(
self
,
vectors
,
labels
,
tree_quantity
:
int
=
200
,
max_tree_depth
:
int
=
10
,
randomness
:
int
=
1
):
super
()
.
__init__
(
vectors
,
labels
)
RandomForestClassifier
(
n_estimators
=
tree_quantity
,
max_depth
=
max_tree_depth
,
random_state
=
randomness
)
class
ITRandomForestClassifier
(
SKLearnMachineLearningModel
):
def
__init__
(
self
,
tree_quantity
:
int
=
200
,
max_tree_depth
:
int
=
10
,
randomness
:
int
=
1
):
super
()
.
__init__
(
RandomForestClassifier
(
n_estimators
=
tree_quantity
,
max_depth
=
max_tree_depth
,
random_state
=
randomness
))
if
__name__
==
"__main__"
:
...
...
custom_models/feature_selection_extraction/algorithmic_feature_extraction_selection.py
View file @
109d33b1
...
...
@@ -9,7 +9,8 @@ from project_utilities import evaluators
import
pandas
import
numba
from
custom_models.classifiers
import
ML_classifiers
import
joblib
from
project_utilities
import
model_interaction
@
numba
.
jit
(
forceobj
=
1
)
...
...
@@ -19,31 +20,33 @@ def preprocess_corpus(corpus: pandas.DataFrame, *columns):
return
corpus
class
ITSupportTFIDFImplementation
:
class
TFIDF_Model
:
vectorizer
=
TfidfVectorizer
dataset
=
pandas
.
DataFrame
vectorized_descriptions
:
list
training_descriptions
=
\
testing_descriptions
=
\
training_labels
=
\
testing_labels
=
numpy
.
ndarray
def
__init__
(
self
,
dataset
:
pandas
.
DataFrame
):
tqdm
.
pandas
(
desc
=
"progress-bar"
)
def
__init__
(
self
):
self
.
vectorizer
=
TfidfVectorizer
(
max_features
=
10000
)
self
.
dataset
=
dataset
def
vectorize_descriptions
(
self
):
self
.
vectorized_descriptions
=
self
.
vectorizer
.
fit_transform
(
self
.
dataset
[
'Description'
]
.
values
)
.
toarray
()
def
fit_to_corpus
(
self
,
texts
):
self
.
vectorizer
.
fit
(
texts
)
def
from_file
(
self
,
filename
,
model_loader
:
model_interaction
.
SKLearnModelFileInteraction
):
self
.
vectorizer
=
model_loader
.
load_from_file
(
filename
)
def
split_dataset
(
self
,
percentage_testing
:
float
):
self
.
training_descriptions
,
self
.
testing_descriptions
,
self
.
training_labels
,
self
.
testing_labels
=
\
train_test_split
(
self
.
vectorized_descriptions
,
self
.
dataset
[
'Priority'
]
.
values
,
test_size
=
percentage_testing
,
random_state
=
1000
)
def
to_file
(
self
,
filename
):
joblib
.
dump
(
self
.
vectorizer
,
filename
)
def
vectorize_description
(
self
,
description
):
return
self
.
vectorizer
.
transform
([
description
])
.
toarray
()
def
Main
():
def
vectorize_descriptions
(
self
,
descriptions
):
return
self
.
vectorizer
.
transform
(
descriptions
)
.
toarray
()
@
staticmethod
def
split_dataset
(
percentage_testing
:
float
,
X
,
y
):
return
train_test_split
(
X
,
y
,
test_size
=
percentage_testing
,
random_state
=
1000
)
'''def Main():
# Get Dataset
dataset = my_datasets.ITSupportDatasetBuilder()
\
.with_summaries_and_descriptions_combined()
\
...
...
@@ -51,22 +54,57 @@ def Main():
.with_pre_processed_descriptions()
\
.build().corpus
tfidf
=
ITSupportTFIDFImplementation
(
dataset
)
tfidf
.
vectorize_descriptions
()
tfidf
.
split_dataset
(
0.1
)
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
vectorised_descriptions = tfidf.vectorize_descriptions(dataset['Description'].tolist())
# print(tfidf.vectorized_descriptions[0].shape)
X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, vectorised_descriptions, dataset['Priority'].tolist())
logreg
=
ML_classifiers
.
ITMultinomialLogisticRegression
(
vectors
=
tfidf
.
training_descriptions
,
labels
=
tfidf
.
training_labels
,
cores_allocated
=-
1
,
inverse_regularisation_strength
=
1e5
)
print
(
'Training Model'
)
logreg = ML_classifiers.ITMultinomialLogisticRegression()
logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
# logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
print('Training Model')
logreg.train_model()
label_predictions
=
logreg
.
make_predictions
(
tfidf
.
testing_descriptions
)
joblib.dump(logreg, "logreg_model.joblib")
print("finished!")
# print(X_train, X_test)
# logreg.train_model(X_train, y_train)
# logreg.save_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
label_predictions = logreg.make_predictions(X_test)
print
(
'Made Predictions'
)
#classification_report(tfidf.testing_labels, label_predictions))
#
print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))
labels = ['P5', 'P4', 'P3', 'P2', 'P1']
cm
=
evaluators
.
ITSupportPriorityConfusionMatrixEvaluator
(
label_predictions
,
tfidf
.
testing_labels
,
labels
)
from sklearn import metrics
# print(metrics.classification_report(y_test, label_predictions))
cm = evaluators.ITSupportPriorityConfusionMatrixEvaluator(label_predictions, y_test, labels)
cm.plot_confusion_matrix(fullscreen_requested=True)
# user_issue = input("Enter ticket desc: ")'''
if
__name__
==
'__main__'
:
Main
()
# Main()
# Get Dataset
'''dataset = my_datasets.ITSupportDatasetBuilder()
\
.with_summaries_and_descriptions_combined()
\
.with_overall_priority_column()
\
.with_pre_processed_descriptions()
\
.build().corpus
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
tfidf.to_file('tfidf_model.joblib')'''
# Get Dataset
'''dataset = my_datasets.ITSupportDatasetBuilder()
\
.with_summaries_and_descriptions_combined()
\
.with_overall_priority_column()
\
.with_pre_processed_descriptions()
\
.build().corpus
tfidf = ITSupportTFIDFImplementation(dataset)
tfidf.vectorize_descriptions()
logreg = joblib.load("logreg_model.joblib")
IT_issue = input("Enter IT issue to be prioritised: ")
preprocessed_input = tfidf.vectorize_description(IT_issue)
label_predictions = logreg.make_predictions(preprocessed_input)
print(label_predictions)'''
main.py
View file @
109d33b1
from
sklearn.preprocessing
import
LabelEncoder
from
keras.utils
import
to_categorical
from
custom_models.classifiers.DL_classifiers
import
KerasCNN
from
custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection
import
TFIDF_Model
from
project_utilities.evaluators
import
DetailedConfusionMatrix
,
AccuracyPerClass
from
project_utilities.model_interaction
import
SKLearnModelFileInteraction
,
KerasModelFileInteraction
from
project_utilities.my_datasets
import
ITSupportDatasetBuilder
from
project_utilities
import
my_datasets
,
evaluators
from
custom_models.feature_selection_extraction
import
ML_DL_feature_extraction_selection
,
algorithmic_feature_extraction_selection
dataset
=
ITSupportDatasetBuilder
()
\
.
with_summaries_and_descriptions_combined
()
\
.
with_overall_priority_column
()
\
.
with_pre_processed_descriptions
()
\
.
build
()
.
corpus
TFIDF_model
=
TFIDF_Model
()
TFIDF_model
.
from_file
(
'custom_models/feature_selection_extraction/tfidf_model.joblib'
,
SKLearnModelFileInteraction
())
if
__name__
==
'__main__'
:
algorithmic_feature_extraction_selection
.
Main
()
CNN_model
=
KerasCNN
()
CNN_model
.
from_file
(
'custom_models/classifiers/CNN_model.h5'
,
KerasModelFileInteraction
())
vectorised_descriptions
=
TFIDF_model
.
vectorize_descriptions
(
dataset
[
'Description'
]
.
tolist
())
X_train
,
X_test
,
y_train
,
y_test
=
TFIDF_model
.
split_dataset
(
0.1
,
vectorised_descriptions
,
dataset
[
'Priority'
]
.
tolist
())
#vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
encoder
=
LabelEncoder
()
.
fit
([
'P5'
,
'P4'
,
'P3'
,
'P2'
,
'P1'
])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
encoded_predictions
=
CNN_model
.
make_predictions
(
X_test
)
decoded_predictions
=
encoder
.
inverse_transform
(
encoded_predictions
.
argmax
(
axis
=
1
))
confusion_matrix
=
DetailedConfusionMatrix
(
decoded_predictions
,
y_test
,
[
'P5'
,
'P4'
,
'P3'
,
'P2'
,
'P1'
])
confusion_matrix
.
plot_confusion_matrix
(
fullscreen_requested
=
True
)
apc
=
AccuracyPerClass
(
decoded_predictions
,
y_test
,
[
'P5'
,
'P4'
,
'P3'
,
'P2'
,
'P1'
])
apc
.
plot_confusion_matrix
()
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
project_utilities/ModelTemplates.py
0 → 100644
View file @
109d33b1
from
abc
import
ABC
,
abstractmethod
from
project_utilities
import
model_interaction
from
keras.models
import
load_model
class SKLearnMachineLearningModel(ABC):
    """Abstract wrapper around an estimator object kept in ``self.model``.

    The wrapped object is used through ``fit`` and ``predict`` only;
    persistence is delegated to a loader object supplied by the caller.
    """

    # Declared but not assigned anywhere in this class.
    cores_allocated: int

    @abstractmethod
    def __init__(self, model=None):
        """Keep a reference to ``model`` (``None`` when not supplied)."""
        self.model = model

    def use_preconfigured_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
        """Replace ``self.model`` with the result of ``model_loader.load_from_file(filename)``."""
        restored = model_loader.load_from_file(filename)
        self.model = restored

    def save_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
        """Hand ``self.model`` and ``filename`` to ``model_loader.load_to_file``."""
        current = self.model
        model_loader.load_to_file(current, filename)

    def train_model(self, vectors, labels):
        """Call ``fit(vectors, labels)`` on the wrapped model."""
        estimator = self.model
        estimator.fit(vectors, labels)

    def make_predictions(self, items):
        """Return ``predict(items)`` from the wrapped model."""
        estimator = self.model
        return estimator.predict(items)
class KerasDeepLearningModel(ABC):
    """Abstract wrapper around a model object kept in ``self.model``.

    Concrete subclasses must provide ``__init__``, ``train_model`` and
    ``make_predictions``; loading, saving, layer addition and compilation
    are implemented here by delegating to the wrapped model or to a loader.
    """

    @abstractmethod
    def __init__(self, model=None):
        """Keep a reference to ``model`` (``None`` when not supplied)."""
        self.model = model

    def from_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
        """Replace ``self.model`` with the result of ``model_loader.load_from_file(filename)``."""
        restored = model_loader.load_from_file(filename)
        self.model = restored

    def to_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
        """Hand ``self.model`` and ``filename`` to ``model_loader.load_to_file``."""
        current = self.model
        model_loader.load_to_file(current, filename)

    def add_model_config(self, layer):
        """Call ``add(layer)`` on the wrapped model."""
        network = self.model
        network.add(layer)

    def compile_model(self, loss_function, optimizer, *metrics):
        """Compile the wrapped model.

        :param loss_function: forwarded as ``loss``.
        :param optimizer: forwarded as ``optimizer``.
        :param metrics: any number of metrics, forwarded together as a list.
        """
        metric_list = list(metrics)
        self.model.compile(loss=loss_function, metrics=metric_list, optimizer=optimizer)

    @abstractmethod
    def train_model(self, vectors, labels, test_vectors, test_labels, epochs, batch_size):
        """Train the wrapped model; subclasses supply the implementation."""

    @abstractmethod
    def make_predictions(self, vectors):
        """Return predictions for ``vectors``; subclasses supply the implementation."""
project_utilities/evaluators.py
View file @
109d33b1
from
sklearn.metrics
import
confusion_matrix
from
seaborn
import
heatmap
from
matplotlib.pyplot
import
show
,
subplots
,
get_current_fig_manager
import
matplotlib.pyplot
as
plt
from
pandas
import
DataFrame
from
numpy
import
sum
as
numpy_sum
,
ndarray
,
empty_like
from
dataclasses
import
dataclass
global
CURRENT_FIGURES
class
ITSupportPriorityConfusionMatrixEvaluator
:
class
DetailedConfusionMatrix
:
"""Class for storing and showing a confusion matrix.
Adapted from https://www.kaggle.com/code/agungor2/various-confusion-matrix-plots/notebook"""
...
...
@@ -36,7 +39,7 @@ class ITSupportPriorityConfusionMatrixEvaluator:
dataset_confusion_matrix_data_frame
.
index
.
name
=
'Actual'
dataset_confusion_matrix_data_frame
.
columns
.
name
=
'Predicted'
label_quantity
=
len
(
self
.
labels
)
fig
,
ax
=
subplots
(
figsize
=
(
label_quantity
,
label_quantity
))
fig
,
ax
=
plt
.
subplots
(
figsize
=
(
label_quantity
,
label_quantity
))
# Adapted from https://stackoverflow.com/questions/42111075/seaborn-heatmap-color-scheme-based-on-row-values
normalised_confusion_matrix
=
dataset_confusion_matrix_data_frame
.
div
(
...
...
@@ -46,10 +49,10 @@ class ITSupportPriorityConfusionMatrixEvaluator:
# Adapted from https://stackoverflow.com/questions/12439588/how-to-maximize-a-plt-show-window-using-python
# (dinvlad)
if
fullscreen_requested
:
fig_manager
=
get_current_fig_manager
()
fig_manager
=
plt
.
get_current_fig_manager
()
fig_manager
.
window
.
state
(
'zoomed'
)
show
()
plt
.
show
()
def
__update_dataset_annotations
(
self
):
n_rows
,
n_columns
=
self
.
dataset_confusion_matrix
.
shape
...
...
@@ -65,4 +68,56 @@ class ITSupportPriorityConfusionMatrixEvaluator:
cell_percentage_of_category
,
cell_predicted_count
,
category_count
)
else
:
self
.
dataset_annotations
[
row
,
column
]
=
'
%
d
%%
\n
%
d/
%
d'
%
(
0
,
0
,
category_count
)
@dataclass
class AccuracyPerClass:
    """Per-class tally of correct and incorrect predictions, plus a stacked bar plot.

    Every per-class result is ordered like ``label_classes``, so the bars
    drawn by ``plot_confusion_matrix`` always line up with their tick labels.
    """
    label_predictions: list  # predicted label for each sample
    actual_labels: list  # true label for each sample, paired by position with label_predictions
    label_classes: list  # class labels; fixes the order of all per-class results

    @property
    def confusion_matrix(self):
        """Return ``confusion_matrix(actual_labels, label_predictions)``."""
        # Inside a method the bare name resolves to the module-level function,
        # not to this property: class attributes are not part of a method's
        # name-lookup scope.
        return confusion_matrix(self.actual_labels, self.label_predictions)

    def sort_to_correct_incorrect_predictions(self):
        """Count correct and incorrect predictions per actual class.

        :return: ``(correct, incorrect)`` - two lists of counts ordered like
            ``label_classes``.
        :raises KeyError: if an actual label is not in ``label_classes``.
        """
        # Key the tallies by the configured classes rather than a fixed
        # 'P5'..'P1' set, so counts cannot be misaligned with the plot labels.
        correct_predictions = dict.fromkeys(self.label_classes, 0)
        incorrect_predictions = dict.fromkeys(self.label_classes, 0)
        for predicted, actual in zip(self.label_predictions, self.actual_labels):
            if predicted == actual:
                correct_predictions[actual] += 1
            else:
                incorrect_predictions[actual] += 1
        return list(correct_predictions.values()), list(incorrect_predictions.values())

    def normalise_correct_incorrect(self, correct, incorrect):
        """Convert per-class counts to proportions of that class's samples.

        :param correct: per-class correct counts.
        :param incorrect: per-class incorrect counts, same order as ``correct``.
        :return: ``(normalised_correct, normalised_incorrect)``; a class with
            no samples yields ``0.0`` in both lists instead of dividing by zero.
        """
        normalised_correct_predictions = []
        normalised_incorrect_predictions = []
        for correct_count, incorrect_count in zip(correct, incorrect):
            total = correct_count + incorrect_count
            if total:
                normalised_correct_predictions.append(correct_count / total)
                normalised_incorrect_predictions.append(incorrect_count / total)
            else:
                # No samples of this class: draw an empty bar.
                normalised_correct_predictions.append(0.0)
                normalised_incorrect_predictions.append(0.0)
        return normalised_correct_predictions, normalised_incorrect_predictions

    def plot_confusion_matrix(self):
        """Show a stacked bar chart of correct/incorrect proportions per class.

        Adapted from firstly phind prompts:
        1. How do you directly compare two confusion matrices
        2. Generate a python script that shows percentage accuracy of 5 different classes
        3. Can you generate the code to plot this with matplotlib
        4. Make the script plot the percentage accuracy of each class
        Then Bing AI GPT Prompts:
        1. generate a python function that plots correct and incorrect for a specified number of classes
        2. could you normalize each class so the bars are equal

        :return: None
        """
        # Create bar plot
        labels = [f'{label_class}' for label_class in self.label_classes]
        correct, incorrect = self.sort_to_correct_incorrect_predictions()
        normalised_correct, normalised_incorrect = self.normalise_correct_incorrect(correct, incorrect)
        width = 0.35
        fig, ax = plt.subplots()
        ax.bar(labels, normalised_correct, width, label='Correct')
        # Stack the incorrect share on top of the correct share.
        ax.bar(labels, normalised_incorrect, width, bottom=normalised_correct, label='Incorrect')
        ax.set_ylabel('Correct - Incorrect Proportion')
        ax.legend()
        plt.show()
project_utilities/model_interaction.py
0 → 100644
View file @
109d33b1
from
abc
import
ABC
,
abstractmethod
import
joblib
from
keras.models
import
load_model
,
save_model
class ModelFileInteraction(ABC):
    """Abstract interface for reading a model from, and writing one to, a file."""

    @staticmethod
    @abstractmethod
    def load_from_file(filename):
        """Return the model stored at ``filename``; implemented by subclasses."""

    @staticmethod
    @abstractmethod
    def load_to_file(model, filename):
        """Write ``model`` to ``filename``; implemented by subclasses."""
class SKLearnModelFileInteraction(ModelFileInteraction):
    """File persistence backed by ``joblib``."""

    @staticmethod
    def load_from_file(filename):
        """Return the object ``joblib.load`` reads from ``filename``."""
        # NOTE(review): joblib.load is pickle-based and can execute arbitrary
        # code while loading - only use it on files from a trusted source.
        restored = joblib.load(filename)
        return restored

    @staticmethod
    def load_to_file(model, filename):
        """Write ``model`` to ``filename`` with ``joblib.dump``."""
        joblib.dump(model, filename)
class KerasModelFileInteraction(ModelFileInteraction):
    """File persistence using ``load_model`` and the model's own ``save`` method."""

    @staticmethod
    def load_from_file(filename):
        """Return the model that ``load_model(filename)`` produces."""
        restored = load_model(filename)
        return restored

    @staticmethod
    def load_to_file(model, filename):
        """Ask ``model`` to save itself to ``filename``."""
        model.save(filename)
project_utilities/my_datasets.py
View file @
109d33b1
from
pandas
import
read_csv
,
read_pickle
,
DataFrame
,
concat
from
pandas
import
read_csv
,
DataFrame
,
concat
from
dataclasses
import
dataclass
import
preprocessing_functionality
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
corpus = DataFrame
raw_dataset = DataFrame
def __init__(self, combined_title_description_requested: bool = False):
self.__get_raw_dataset()
self.__get_dataset(combined_title_description_requested)
self.__add_overall_priority_column()
def __get_raw_dataset(self):
self.raw_dataset = read_csv('C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT
\\
project_utilities'
'
\\
Datasets
\\
ITSupport_Tickets.csv')
#ticket_data_high_prio = read_csv('C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT
\\
project_utilities'
#'
\\
Datasets
\\
ITSupport_Tickets_High_Prio.csv')
#self.raw_dataset = ticket_data_low_prio
def __get_dataset(self, combined_title_description_requested: bool):
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
texts = self.raw_dataset['Description'].tolist()
if combined_title_description_requested:
summaries = self.raw_dataset['Incident_Summary'].tolist()
non_nulled_dataset = self.__remove_nulls_with_summaries(impacts, urgencies, texts, summaries)
else:
non_nulled_dataset = self.__remove_nulls(impacts, urgencies, texts)
self.corpus = DataFrame(non_nulled_dataset)
def __remove_nulls(self, impacts, urgencies, descriptions):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(descriptions[index])
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __remove_nulls_with_summaries(self, impacts, urgencies, descriptions, summaries):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(str(summaries[index]) + ' ' + str(descriptions[index]))
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
pnums = []
for priorities in zip(self.corpus['Impacts'], self.corpus['Urgencies']):
numbered_priority = sum([prio_to_num[priorities[0]], prio_to_num[priorities[1]]])
pnums.append(num_to_pnum[numbered_priority])
self.corpus['Priorities'] = pnums'''
@
dataclass
class
ITSupportDatasetWithBuilder
:
...
...
@@ -81,12 +15,13 @@ class ITSupportDatasetWithBuilder:
def
__init__
(
self
):
self
.
__get_raw_dataset
()
self
.
__remove_nulls
()
#self.corpus = self.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
def
__get_raw_dataset
(
self
):
ticket_data_low_prio
=
read_csv
(
'/
\\
project_utilities'
'
\\
Datasets
\\
ITSupport_Tickets.csv'
)
ticket_data_high_prio
=
read_csv
(
'/
\\
project_utilities'
'
\\
Datasets
\\
ITSupport_Tickets_High_Prio.csv'
)
ticket_data_low_prio
=
read_csv
(
"C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT"
"
\\
project_utilities
\\
Datasets
\\
ITSupport_Tickets.csv"
)
ticket_data_high_prio
=
read_csv
(
"C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT"
"
\\
project_utilities
\\
Datasets
\\
ITSupport_Tickets_High_Prio.csv"
)
self
.
corpus
=
concat
([
ticket_data_low_prio
,
ticket_data_high_prio
])
def
combine_summaries_with_descriptions
(
self
):
...
...
@@ -143,6 +78,71 @@ class ITSupportDatasetBuilder(object):
return
self
.
_dataset
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
corpus = DataFrame
raw_dataset = DataFrame
def __init__(self, combined_title_description_requested: bool = False):
self.__get_raw_dataset()
self.__get_dataset(combined_title_description_requested)
self.__add_overall_priority_column()
def __get_raw_dataset(self):
self.raw_dataset = read_csv('C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT
\\
project_utilities'
'
\\
Datasets
\\
ITSupport_Tickets.csv')
#ticket_data_high_prio = read_csv('C:
\\
Users
\\
Benjamin
\\
PycharmProjects
\\
DISSERTATION_ARTEFACT
\\
project_utilities'
#'
\\
Datasets
\\
ITSupport_Tickets_High_Prio.csv')
#self.raw_dataset = ticket_data_low_prio
def __get_dataset(self, combined_title_description_requested: bool):
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
texts = self.raw_dataset['Description'].tolist()
if combined_title_description_requested:
summaries = self.raw_dataset['Incident_Summary'].tolist()
non_nulled_dataset = self.__remove_nulls_with_summaries(impacts, urgencies, texts, summaries)
else:
non_nulled_dataset = self.__remove_nulls(impacts, urgencies, texts)
self.corpus = DataFrame(non_nulled_dataset)
def __remove_nulls(self, impacts, urgencies, descriptions):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(descriptions[index])
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __remove_nulls_with_summaries(self, impacts, urgencies, descriptions, summaries):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(str(summaries[index]) + ' ' + str(descriptions[index]))
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
pnums = []
for priorities in zip(self.corpus['Impacts'], self.corpus['Urgencies']):
numbered_priority = sum([prio_to_num[priorities[0]], prio_to_num[priorities[1]]])
pnums.append(num_to_pnum[numbered_priority])
self.corpus['Priorities'] = pnums'''
'''
#Previous method, more efficient, way more lines though
impacts = self.raw_dataset['Impact'].tolist()
...
...
@@ -189,3 +189,6 @@ if __name__ == '__main__':
corpus.to_pickle('corpus.pickle')'''
dataset
=
ITSupportDatasetBuilder
()
.
with_summaries_and_descriptions_combined
()
.
with_overall_priority_column
()
.
build
()
print
(
dataset
.
corpus
.
shape
)
dataset
.
corpus
=
dataset
.
corpus
.
reset_index
()
.
drop_duplicates
(
subset
=
'index'
,
keep
=
'first'
)
.
set_index
(
'index'
)
print
(
dataset
.
corpus
.
shape
)
print
(
dataset
.
corpus
.
loc
[
1
])
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment