Commit e689e130 authored by benjamin.clough

Almost final commit!

parent 1d16a38d
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
@@ -5,7 +6,7 @@ import custom_models.classifiers.ML_classifiers
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass, print_evaluation_metrics
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
@@ -13,8 +14,10 @@ from projectsettings import DefaultConfig
import pandas as pd
# Load Dataset
dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
@@ -22,39 +25,45 @@ dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}
.with_pre_processed_descriptions() \
.build().corpus
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
'''doc2vec_model.from_file(
doc2vec_model.from_file(
f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
GensimWordEmbeddingModelFileInteraction())'''
tagged_training_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_train_str, 'Priority': y_train}))
tagged_testing_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_test_str, 'Priority': y_test}))
doc2vec_model.build_vocabulary(tagged_training_documents)
doc2vec_model.train_model(tagged_training_documents, dataset_shuffles=10, epochs=10)
#doc2vec_model.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#tagged_descriptions = doc2vec_model.tag_documents(X_test_str)
X_train = doc2vec_model.vectorize_documents(X_train_str)
X_test = doc2vec_model.vectorize_documents(X_test_str)
# Load Logistic Regression model
logreg_model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
'''logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_to_logreg_model.joblib',
SKLearnModelFileInteraction())'''
logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())
GensimWordEmbeddingModelFileInteraction())
train, test = doc2vec_model.split_texts(dataset)
tagged_training_documents = doc2vec_model.tag_documents(train)
tagged_testing_documents = doc2vec_model.tag_documents(test)
X_train = doc2vec_model.vectorize_tagged_documents(tagged_training_documents)
X_test = doc2vec_model.vectorize_tagged_documents(tagged_testing_documents)
X_test = np.asarray(X_test[1], dtype=np.float32)
encoder = LabelEncoder()
encoder.fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = encoder.transform(train['Priority'])
y_val = encoder.transform(test['Priority'])
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
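# Note: LabelEncoder sorts labels alphabetically, so 'P1'..'P5' map to 0..4
# regardless of the order passed to fit(); to_categorical then one-hot encodes,
# e.g. 'P3' -> index 2 -> [0., 0., 1., 0., 0.].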
# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper_doc2vec.h5',
KerasModelFileInteraction())
# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connection across the whole campus, this needs fixing ASAP")
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
# Make predictions
predictions = logreg_model.make_predictions(X_test)
encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
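# e.g. a softmax row [0.05, 0.7, 0.1, 0.1, 0.05] -> argmax 1 -> encoder.classes_[1] == 'P2'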
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
print_evaluation_metrics(decoded_predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
\ No newline at end of file
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import custom_models.classifiers.ML_classifiers
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass, print_evaluation_metrics
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
doc2vec_model.from_file(
f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
GensimWordEmbeddingModelFileInteraction())
train, test = doc2vec_model.split_texts(dataset)
tagged_training_documents = doc2vec_model.tag_documents(train)
tagged_testing_documents = doc2vec_model.tag_documents(test)
X_train = doc2vec_model.vectorize_tagged_documents(tagged_training_documents)
X_test = doc2vec_model.vectorize_tagged_documents(tagged_testing_documents)[1]
#tagged_training_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_train_str, 'Priority': y_train}))
#tagged_testing_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_test_str, 'Priority': y_test}))
#doc2vec_model.build_vocabulary(tagged_training_documents)
#doc2vec_model.train_model(tagged_training_documents, dataset_shuffles=10, epochs=10)
#doc2vec_model.to_file("doc2vec_model.model", GensimWordEmbeddingModelFileInteraction())
#tagged_descriptions = doc2vec_model.tag_documents(X_test_str)
# Load Logistic Regression model
model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_logreg_model_larger_doc2vec.joblib',
SKLearnModelFileInteraction())
#logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
#apc = AccuracyPerClass(predictions,test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
#apc.plot_confusion_matrix()
#------------------------------------------------------------------------------------------------------------
# Load Multinomial Naive Bayes model
model = custom_models.classifiers.ML_classifiers.ITMultinomialNaiveBayes()
model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_MNB_model_larger_doc2vec.joblib',
SKLearnModelFileInteraction())
#logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
# Load Support Vector Classifier model
model = custom_models.classifiers.ML_classifiers.ITSupportVectorClassifier()
model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_SVC_model_larger_doc2vec.joblib',
SKLearnModelFileInteraction())
#logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, test['Priority'], ['P5', 'P4', 'P3', 'P2', 'P1'])
@@ -2,11 +2,12 @@ from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass, print_evaluation_metrics
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
import pandas as pd
@@ -63,3 +64,5 @@ prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')'''
print_evaluation_metrics(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
\ No newline at end of file
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.classifiers.ML_classifiers import ITMultinomialLogisticRegression
from custom_models.classifiers.ML_classifiers import *
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass, print_evaluation_metrics
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
@@ -21,47 +21,69 @@ dataset = ITSupportDatasetBuilder(
.with_pre_processed_descriptions() \
.build().corpus
# Load Pre-configured TF-IDF
# Load Pre-configured TF-IDF for logreg
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
# Load Logistic Regression model
logreg_model = ITMultinomialLogisticRegression(cores_allocated=1)
logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_logreg_model_larger.joblib',
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_logreg_model.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
predictions = logreg_model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
# Convert the descriptions to sparse TF-IDF matrices representing the text
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
#X_train = TFIDF_model.vectorize_descriptions(X_train_str)
#logreg_model.train_model(X_train, y_train)
# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connection across the whole campus, this needs fixing ASAP")
#encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
# Make predictions
# Load Multinomial Naive Bayes model
logreg_model = ITMultinomialNaiveBayes()
logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_MNB_model_larger.joblib',
SKLearnModelFileInteraction())
predictions = logreg_model.make_predictions(X_test)
#decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
# Load Support Vector Classifier model
logreg_model = ITSupportVectorClassifier()
logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_SVC_model_larger.joblib',
SKLearnModelFileInteraction())
predictions = logreg_model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
print_evaluation_metrics(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
# export predictions to file
'''dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')'''
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from project_utilities.ModelTemplates import SKLearnMachineLearningModel
@@ -7,6 +7,8 @@ from custom_models.feature_selection_extraction.algorithmic_feature_extraction_s
from projectsettings import DefaultConfig
from project_utilities.model_interaction import SKLearnModelFileInteraction
from project_utilities import my_datasets
# IT implementation of Multinomial Logistic Regression
class ITMultinomialLogisticRegression(SKLearnMachineLearningModel):
def __init__(self, inverse_regularisation_strength: float = 1e5, cores_allocated: int = 1):
super().__init__(LogisticRegression(n_jobs=cores_allocated,
@@ -16,17 +18,17 @@ class ITMultinomialLogisticRegression(SKLearnMachineLearningModel):
verbose=1,
max_iter=10000))
# IT implementation of Multinomial / Gaussian Naive Bayes
class ITMultinomialNaiveBayes(SKLearnMachineLearningModel):
def __init__(self):
super().__init__(MultinomialNB())
super().__init__(MultinomialNB())#GaussianNB())
# IT implementation of Support Vector Machines
class ITSupportVectorClassifier(SKLearnMachineLearningModel):
def __init__(self):
super().__init__(LinearSVC())
# IT implementation of Random Forest Classifier (not used)
class ITRandomForestClassifier(SKLearnMachineLearningModel):
def __init__(self, tree_quantity: int = 200, max_tree_depth: int = 10, randomness: int = 1):
super().__init__(RandomForestClassifier(n_estimators=tree_quantity, max_depth=max_tree_depth, random_state=randomness))
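# Usage sketch (assumes vectors/labels already produced by a feature extractor):
#   classifier = ITSupportVectorClassifier()
#   classifier.train_model(vectors=X_train, labels=y_train)
#   predictions = classifier.make_predictions(X_test)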
import multiprocessing
import time
from enum import Enum
import gensim.models
import gensim.models.doc2vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from pandas import DataFrame
from sklearn import utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from project_utilities import model_interaction
from project_utilities.ModelTemplates import GensimWordEmbeddingModel
from gensim.models.doc2vec import Doc2Vec
# from project_utilities import preprocessing_functionality, my_datasets
from project_utilities import my_datasets
import preprocessing_functionality
from projectsettings import DefaultConfig
from project_utilities.model_interaction import SKLearnModelFileInteraction
from project_utilities.my_datasets import ITSupportDatasetBuilder
import custom_models.classifiers.ML_classifiers
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
class Doc2VecModels(Enum):
DBOW = 1
@@ -32,20 +30,22 @@ class ITSupportDoc2VecImplementation(GensimWordEmbeddingModel):
tqdm.pandas(desc="progress-bar")
super().__init__(model)
# Split text into test / train, same split and random state as TF-IDF, just implemented twice :)
@staticmethod
def split_texts(dataset):
training_data, testing_data = train_test_split(dataset, test_size=0.1, random_state=1000)
return training_data, testing_data
# Convert text to TaggedDocuments for model training
def tag_documents(self, documents) -> DataFrame:
tagged_documents = documents.apply(
lambda docs: gensim.models.doc2vec.TaggedDocument(
words=preprocessing_functionality.tokenize_text(docs.Description),
tags=[docs.Priority]),
axis=1)
return tagged_documents
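# e.g. one row becomes TaggedDocument(words=['wifi', 'outage', ...], tags=['P1'])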
# Choose which model to implement
def create_model(self):
cores = multiprocessing.cpu_count()
match self.model_type:
@@ -56,60 +56,94 @@ class ITSupportDoc2VecImplementation(GensimWordEmbeddingModel):
case _:
raise TypeError("Must be a Doc2Vec model type (DBOW, DM, COMBINED)")
# Create the DBOW variant
def _create_dbow_model(self, cores):
model = Doc2Vec(
dm=0, vector_size=1000, negative=5, hs=0, min_count=2, sample=0, workers=cores)
self.alpha_change = 0.0002
return model
# Create the DM variant
def _create_dm_model(self, cores):
model = Doc2Vec(
dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
dm=1, dm_mean=1, vector_size=1000, window=10, negative=5,
min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
self.alpha_change = -0.002
return model
# Build model vocabulary
def build_vocabulary(self, tagged_training_documents):
vocabulary = [x for x in tqdm(tagged_training_documents.values)]
self.model.build_vocab(vocabulary)
# Train the model
def train_model(self, tagged_training_documents, dataset_shuffles: int = 1, epochs: int = 1):
for training_round in range(dataset_shuffles):
# shuffle training data
shuffled_training_data = utils.shuffle([x for x in tqdm(tagged_training_documents.values)])
dataset_size = len(tagged_training_documents)
self.model.train(shuffled_training_data, total_examples=dataset_size,epochs=epochs)
self.model.train(shuffled_training_data, total_examples=dataset_size, epochs=epochs)
self.model.alpha += self.alpha_change
self.model.min_alpha = self.model.alpha
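# Note: each shuffle round nudges the learning rate by alpha_change and pins
# min_alpha to it, giving a simple per-round learning-rate schedule.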
# @numba.jit(forceobj=True)
# Turn tagged docs to word embeddings & associated classifications
def vectorize_tagged_documents(self, tagged_documents):
sentences = tagged_documents.values
targets, regressors = zip(*[(doc.tags[0], self.model.infer_vector(doc.words)) for doc in sentences])
return targets, regressors
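# Note: returns (labels, vectors) as two parallel tuples; callers index [1]
# for the embedding matrix and [0] for the matching priority labels.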
# Get word embeddings for training purposes
def generate_training_vectors(self, tagged_documents):
labels, descriptions = self.vectorize_tagged_documents(tagged_documents)
return labels, descriptions
# Only get word embeddings
def vectorize_documents(self, documents):
documents = [document.split(' ') for document in documents]
return [self.model.infer_vector(document) for document in documents]
if __name__ == '__main__':
dataset = my_datasets.ITSupportDatasetBuilder()\
.with_overall_priority_column()\
.with_summaries_and_descriptions_combined()\
.with_pre_processed_descriptions()\
# Load the datasets and preprocess them
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Create Doc2Vec object
doc2vec_IT = ITSupportDoc2VecImplementation(model_type=Doc2VecModels.DBOW, alpha_change=-0.002)
# Split the dataset
training_documents, testing_documents = doc2vec_IT.split_texts(dataset)
tagged_training_documents = doc2vec_IT.tag_documents(training_documents)
tagged_testing_documents = doc2vec_IT.tag_documents(testing_documents)
# build vocabulary then train the model
doc2vec_IT.build_vocabulary(tagged_training_documents)
doc2vec_IT.train_model(tagged_training_documents, dataset_shuffles=3, epochs=10)
doc2vec_IT.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#doc2vec_IT.generate_vectors()
#print(doc2vec_IT.X_test)
doc2vec_IT.train_model(tagged_training_documents, dataset_shuffles=15, epochs=5)
# Save to file
#doc2vec_IT.to_file("doc2vec_model_15shuffles_15epochs_DBOW.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#doc2vec_IT.from_file("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\custom_models\\feature_selection_extraction\\doc2vec_model_15shuffles_15epochs_DBOW.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
X_train = doc2vec_IT.vectorize_tagged_documents(tagged_training_documents)
y_train = training_documents['Priority']
X_test = doc2vec_IT.vectorize_tagged_documents(tagged_testing_documents)
y_test = testing_documents['Priority']
print(X_train[0], X_train[1], end="!!!!!!!!!!!!!!!!!!!!!")
logreg_model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
logreg_model.use_preconfigured_model(f"{DefaultConfig.absolute_project_root_path()}/custom_models/feature_selection_extraction/doc2vec_to_logreg_model_larger_DBOW.joblib", SKLearnModelFileInteraction())
#logreg_model.train_model(X_train[1], y_train)
#logreg_model.save_model('doc2vec_to_logreg_model_larger_DBOW.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = logreg_model.make_predictions(X_test[1])
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
# doc2vec_IT.generate_vectors()
# print(doc2vec_IT.X_test)
import numpy
from tqdm import tqdm
import projectsettings
from project_utilities import my_datasets, preprocessing_functionality
from project_utilities import my_datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from project_utilities import evaluators
import pandas
import numba
#from custom_models.classifiers import ML_classifiers
import joblib
from project_utilities import model_interaction
from projectsettings import DefaultConfig
@numba.jit(forceobj=1)
def preprocess_corpus(corpus: pandas.DataFrame, *columns):
for column in columns:
corpus[column] = corpus[column].apply(preprocessing_functionality.clean_text)
return corpus
class TFIDF_Model:
vectorizer = TfidfVectorizer
@@ -54,54 +37,8 @@ class TFIDF_Model:
return train_test_split(X, y, test_size=percentage_testing, random_state=1000)
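# Usage sketch, mirroring the calls made elsewhere in this commit:
#   X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(
#       0.1, dataset['Description'].tolist(), dataset['Priority'].tolist())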
'''def Main():
# Get Dataset
dataset = my_datasets.ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
vectorised_descriptions = tfidf.vectorize_descriptions(dataset['Description'].tolist())
# print(tfidf.vectorized_descriptions[0].shape)
X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, vectorised_descriptions, dataset['Priority'].tolist())
logreg = ML_classifiers.ITMultinomialLogisticRegression()
logreg.use_preconfigured_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
# logreg.use_preconfigured_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
print('Training Model')
logreg.train_model()
joblib.dump(logreg, "tfidf_to_logreg_model.joblib")
print("finished!")
# print(X_train, X_test)
# logreg.train_model(X_train, y_train)
# logreg.save_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
label_predictions = logreg.make_predictions(X_test)
# print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))
labels = ['P5', 'P4', 'P3', 'P2', 'P1']
from sklearn import metrics
# print(metrics.classification_report(y_test, label_predictions))
cm = evaluators.ITSupportPriorityConfusionMatrixEvaluator(label_predictions, y_test, labels)
cm.plot_confusion_matrix(fullscreen_requested=True)
# user_issue = input("Enter ticket desc: ")'''
if __name__ == '__main__':
# Main()
# Get Dataset
'''dataset = my_datasets.ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
tfidf.to_file('tfidf_model.joblib')'''
# Get Dataset
dataset = my_datasets.ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.model_interaction import SKLearnModelFileInteraction
from projectsettings import DefaultConfig
from project_utilities.my_datasets import ITSupportDatasetBuilder
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import custom_models.classifiers.ML_classifiers
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
# Load Dataset
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
'''# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.2, dataset['Description'].tolist(),
dataset['Priority'].tolist())
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
X_test = np.expand_dims(X_test, axis=-1)
X_train = TFIDF_model.vectorize_descriptions(X_train_str)
X_train = np.expand_dims(X_train, axis=-1)
# Encode class labels
encoder = LabelEncoder()
encoder.fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = encoder.transform(y_train)
y_val = encoder.transform(y_test)
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)'''
# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
doc2vec_model.from_file(
f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
GensimWordEmbeddingModelFileInteraction())
train, test = doc2vec_model.split_texts(dataset)
tagged_training_documents = doc2vec_model.tag_documents(train)
tagged_testing_documents = doc2vec_model.tag_documents(test)
X_train = doc2vec_model.vectorize_tagged_documents(tagged_training_documents)
X_test = doc2vec_model.vectorize_tagged_documents(tagged_testing_documents)
# Adapted x train shape from Bing AI with prompt: "In Python, generate a script that feeds a Gensim Doc2Vec word embedding into a Keras CNN"
X_train_shape = doc2vec_model.model.wv.vectors
X_train_shape = X_train_shape.reshape((X_train_shape.shape[0], X_train_shape.shape[1], 1))
print((X_train_shape.shape[1], 1))
# Encode class labels
encoder = LabelEncoder()
encoder.fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = encoder.transform(train['Priority'])
y_val = encoder.transform(test['Priority'])
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
#X_test = np.expand_dims(X_test, axis=-1)
#X_train = np.expand_dims(X_train, axis=-1)
X_train = np.asarray(X_train[1], dtype=np.float32)
X_test = np.asarray(X_test[1], dtype=np.float32)
num_classes = 5
input_shape = (1, 1000)
model = Sequential()
model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_shape.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
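# Shape note: each Doc2Vec DBOW embedding has length 1000 (the DBOW vector_size),
# so Conv1D sees (1000, 1) inputs and the 5 softmax units match classes P1-P5.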
# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8)
model.fit(X_train, y_train, epochs=50, batch_size=50, validation_data=(X_test, y_val), callbacks=[early_stopping])
model.save('CNN_model_deeper_doc2vec.h5')
\ No newline at end of file
import multiprocessing
import time
from enum import Enum
import gensim.models
import gensim.models.doc2vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from pandas import DataFrame
from sklearn import utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection
from project_utilities import model_interaction
from project_utilities.ModelTemplates import GensimWordEmbeddingModel
from gensim.models.doc2vec import Doc2Vec
# from project_utilities import preprocessing_functionality, my_datasets
from project_utilities import my_datasets
import preprocessing_functionality
from projectsettings import DefaultConfig
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, \
GensimWordEmbeddingModelFileInteraction
from project_utilities.my_datasets import ITSupportDatasetBuilder
import custom_models.classifiers.ML_classifiers
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
# Load the datasets and preprocess them
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Create Doc2Vec object
doc2vec_IT = custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection.ITSupportDoc2VecImplementation(model_type=custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection.Doc2VecModels.DM, alpha_change=-0.02)
# Split the dataset
training_documents, testing_documents = doc2vec_IT.split_texts(dataset)
tagged_training_documents = doc2vec_IT.tag_documents(training_documents)
tagged_testing_documents = doc2vec_IT.tag_documents(testing_documents)
# build vocabulary then train the model
doc2vec_IT.build_vocabulary(tagged_training_documents)
doc2vec_IT.train_model(tagged_training_documents, dataset_shuffles=15, epochs=5)
# Save to file
#doc2vec_IT.to_file("doc2vec_model_15shuffles_15epochs_DBOW.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#doc2vec_IT.from_file("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\custom_models\\feature_selection_extraction\\doc2vec_model_15shuffles_15epochs_DBOW.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
X_train = doc2vec_IT.vectorize_tagged_documents(tagged_training_documents)
y_train = training_documents['Priority']
X_test = doc2vec_IT.vectorize_tagged_documents(tagged_testing_documents)
y_test = testing_documents['Priority']
print(X_train[0], X_train[1], end="!!!!!!!!!!!!!!!!!!!!!")
logreg_model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
#logreg_model.use_preconfigured_model(f"{DefaultConfig.absolute_project_root_path()}/custom_models/feature_selection_extraction/doc2vec_to_logreg_model_larger_DBOW.joblib", SKLearnModelFileInteraction())
logreg_model.train_model(X_train[1], y_train)
#logreg_model.save_model('doc2vec_to_logreg_model_larger_DBOW.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = logreg_model.make_predictions(X_test[1])
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
# doc2vec_IT.generate_vectors()
# print(doc2vec_IT.X_test)
\ No newline at end of file
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.classifiers.ML_classifiers import *
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, \
GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
# Load Dataset
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
'''# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())'''
# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
doc2vec_model.from_file(
f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
GensimWordEmbeddingModelFileInteraction())
train, test = doc2vec_model.split_texts(dataset)
# Split dataset into test and train
tagged_training_documents = doc2vec_model.tag_documents(train)
tagged_testing_documents = doc2vec_model.tag_documents(test)
# Get word embeddings
X_train = doc2vec_model.vectorize_tagged_documents(tagged_training_documents)
y_train = train['Priority']
X_test = doc2vec_model.vectorize_tagged_documents(tagged_testing_documents)
y_test = test['Priority']
# Load all models
print("Load")
logreg_model = ITMultinomialLogisticRegression()
MNB_model = ITMultinomialNaiveBayes()
SVC_model = ITSupportVectorClassifier()
# Train all models on the Doc2Vec embeddings
print("Train")
logreg_model.train_model(X_train[1], y_train)
MNB_model.train_model(X_train[1], y_train)
SVC_model.train_model(X_train[1], y_train)
# Save models
print("Save")
logreg_model.save_model("tfidf_to_logreg_model_larger_doc2vec.joblib", SKLearnModelFileInteraction())
MNB_model.save_model("tfidf_to_MNB_model_larger_doc2vec.joblib", SKLearnModelFileInteraction())
SVC_model.save_model("tfidf_to_SVC_model_larger_doc2vec.joblib", SKLearnModelFileInteraction())
@@ -34,22 +34,28 @@ class KerasDeepLearningModel(ABC):
def __init__(self, model=None):
self.model = model
# Load model from file
def from_file(self, filename, model_loader: model_interaction.FileInteraction):
self.model = model_loader.load_from_file(filename)
# Save model to file
def to_file(self, filename, model_loader: model_interaction.FileInteraction):
model_loader.load_to_file(self.model, filename)
# Add neural network layer
def add_model_config(self, layer):
self.model.add(layer)
# Compile model prior to training
def compile_model(self, loss_function, optimizer, *metrics):
self.model.compile(loss=loss_function, metrics=[*metrics, ], optimizer=optimizer)
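# Usage sketch: model.compile_model('categorical_crossentropy', 'adam', 'accuracy')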
# Train model
@abstractmethod
def train_model(self, vectors, labels, test_vectors, test_labels, epochs, batch_size):
pass
# Given vectors, make predictions
@abstractmethod
def make_predictions(self, vectors):
pass
@@ -61,9 +67,10 @@ class GensimWordEmbeddingModel(ABC):
def __init__(self, model=None):
self.model = model
# Load model from file
def from_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
self.model = model_loader.load_from_file(filename)
print(self.model)
# Save model to file
def to_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
model_loader.load_to_file(self.model, filename)
@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
from pandas import DataFrame
from numpy import sum as numpy_sum, ndarray, empty_like
from dataclasses import dataclass
from sklearn.metrics import classification_report, balanced_accuracy_score, accuracy_score
global CURRENT_FIGURES
@@ -25,49 +25,69 @@ class DetailedConfusionMatrix:
self.actual_values = actual_values
self.labels = labels
print(self.labels)
# Generate text-based confusion matrix
self.dataset_confusion_matrix = confusion_matrix(self.actual_values, self.predictions, labels=self.labels)
self.dataset_annotations = empty_like(self.dataset_confusion_matrix).astype(str)
# Get total number of items per class
self.confusion_matrix_sums = numpy_sum(self.dataset_confusion_matrix, axis=1, keepdims=True)
# Get the proportion of each class predicted compared to the total number of actual tickets of the class
self.confusion_matrix_percentages = self.dataset_confusion_matrix / self.confusion_matrix_sums.astype(
float) * 100
def plot_confusion_matrix(self, fullscreen_requested: bool = False):
# Update cell annotations
self.__update_dataset_annotations()
# Convert confusion matrix to DataFrame
dataset_confusion_matrix_data_frame = DataFrame(self.dataset_confusion_matrix,
index=self.labels,
columns=self.labels)
# Add labels to X and Y
dataset_confusion_matrix_data_frame.index.name = 'Actual'
dataset_confusion_matrix_data_frame.columns.name = 'Predicted'
label_quantity = len(self.labels)
# Add sub-labels (the classes) to X and Y
fig, ax = plt.subplots(figsize=(label_quantity, label_quantity))
# Normalise values to enable heatmap visualisation
# Adapted from https://stackoverflow.com/questions/42111075/seaborn-heatmap-color-scheme-based-on-row-values
normalised_confusion_matrix = dataset_confusion_matrix_data_frame.div(
dataset_confusion_matrix_data_frame.max(axis=1), axis=0)
heatmap(normalised_confusion_matrix, cmap="YlGnBu", annot=self.dataset_annotations, fmt='', ax=ax)
# Show heatmap visualisation fullscreen if requested
# Adapted from https://stackoverflow.com/questions/12439588/how-to-maximize-a-plt-show-window-using-python
# (dinvlad)
if fullscreen_requested:
fig_manager = plt.get_current_fig_manager()
fig_manager.window.state('zoomed')
# Render the heatmap
plt.show()
def __update_dataset_annotations(self):
# Get row & column quantity
n_rows, n_columns = self.dataset_confusion_matrix.shape
# For each row in each column (every cell) call the alter_annotation method
[self.alter_annotation(row, column) for row in range(n_rows) for column in range(n_columns)]
def alter_annotation(self, row: int, column: int):
# Get number of predictions for cell
cell_predicted_count = self.dataset_confusion_matrix[row, column]
# Get percentage of total class predictions for the cell
cell_percentage_of_category = self.confusion_matrix_percentages[row, column]
# Get total number of actual tickets in the class (row total)
category_count = self.confusion_matrix_sums[row]
# Annotate the cell with its prediction count, percentage, and class total;
# off-diagonal cells with zero predictions are annotated with zeros instead
if row == column or cell_predicted_count != 0:
self.dataset_annotations[row, column] = '%.1f%%\n%d/%d' % (
cell_percentage_of_category, cell_predicted_count, category_count)
else:
self.dataset_annotations[row, column] = '%d%%\n%d/%d' % (0, 0, category_count)
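# e.g. 42 of 50 actual P1 tickets predicted as P1 renders as "84.0%\n42/50"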
# Unused, shows the recall per provided classification
@dataclass
class AccuracyPerClass:
label_predictions: list
@@ -96,6 +116,7 @@ class AccuracyPerClass:
normalised_correct_predictions = [correct[label] / total_predictions_per_label[label] for label in range(len(correct))]
normalised_incorrect_predictions = [incorrect[label] / total_predictions_per_label[label] for label in range(len(incorrect))]
return normalised_correct_predictions, normalised_incorrect_predictions
def plot_confusion_matrix(self):
"""
Adapted from firstly phind prompts:
@@ -121,3 +142,12 @@ class AccuracyPerClass:
ax.set_ylabel('Correct - Incorrect Proportion')
ax.legend()
plt.show()
def print_evaluation_metrics(predictions, actual, labels):
# Print Precision, Recall per class
print(classification_report(actual, predictions, labels=labels))
# Print Overall Accuracy
print("Accuracy: ", accuracy_score(actual, predictions))
# Print Balanced Accuracy
print("Balanced Accuracy: ", balanced_accuracy_score(actual, predictions))
@@ -3,7 +3,7 @@ import joblib
from keras.models import load_model, save_model
from gensim.models.doc2vec import Doc2Vec
# Abstract base class for saving and loading models from files
class FileInteraction(ABC):
@staticmethod
@abstractmethod
@@ -15,7 +15,7 @@ class FileInteraction(ABC):
def load_to_file(model, filename):
pass
# scikit-learn implementation of FileInteraction
class SKLearnModelFileInteraction(FileInteraction):
@staticmethod
@@ -26,7 +26,7 @@ class SKLearnModelFileInteraction(FileInteraction):
def load_to_file(model, filename):
joblib.dump(model, filename)
# Keras implementation of FileInteraction
class KerasModelFileInteraction(FileInteraction):
@staticmethod
def load_from_file(filename):
@@ -36,7 +36,7 @@ class KerasModelFileInteraction(FileInteraction):
def load_to_file(model, filename):
save_model(model, filename)
# Gensim implementation of FileInteraction
class GensimWordEmbeddingModelFileInteraction(FileInteraction):
@staticmethod
@@ -31,6 +31,7 @@ class ITSupportDatasetWithBuilder:
return filetypes[filetype](filename)
# Combine the short and long description columns of tickets
def combine_summaries_with_descriptions(self):
combined_columns = []
for description, summary in zip(self.corpus['Description'].values, self.corpus['Incident Summary'].values):
@@ -38,11 +39,13 @@
self.corpus['Description'] = combined_columns
# Remove null records from datasets
def __remove_nulls(self):
self.corpus.replace('[None]', None, inplace=True)
self.corpus.dropna(axis=0, subset=['Description', 'Impact', 'Urgency'], inplace=True, how='any')
self.corpus.fillna('', axis=1, inplace=True)
# Convert Impact & Urgency to P5 - P1
def add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
@@ -73,148 +76,44 @@ class ITSupportDatasetBuilder:
self._dataset = ITSupportDatasetWithBuilder(*dataset_filenames)
# Combine ticket summary and main descriptions
def with_summaries_and_descriptions_combined(self):
self._dataset.combine_summaries_with_descriptions()
return self
# Convert Impact & Urgency to P1 - P5
def with_overall_priority_column(self):
self._dataset.add_overall_priority_column()
return self
# Pre-process descriptions
def with_pre_processed_descriptions(self):
self._dataset.pre_process_texts()
return self
# Return the dataset object
def build(self):
return self._dataset
def generate_synonyms(dataset: DataFrame, filename):
# Create an instance of the SynonymAug class
aug = naw.SynonymAug(aug_src='wordnet', verbose=True)
# Copy the original dataset
copied_dataset = dataset.copy()
# Apply the synonym function to each ticket
copied_dataset['Description'] = copied_dataset['Description'].apply(lambda doc: aug.augment(doc)[0])
# Export new dataset to csv
copied_dataset.to_csv(filename)
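# Example call (see the commented invocation at the bottom of this file):
#   generate_synonyms(dataset, 'Datasets/synonym_IT_tickets.csv')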
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
corpus = DataFrame
raw_dataset = DataFrame
def __init__(self, combined_title_description_requested: bool = False):
self.__get_raw_dataset()
self.__get_dataset(combined_title_description_requested)
self.__add_overall_priority_column()
def __get_raw_dataset(self):
self.raw_dataset = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
'\\Datasets\\ITSupport_Tickets.csv')
#ticket_data_high_prio = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
#'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
#self.raw_dataset = ticket_data_low_prio
def __get_dataset(self, combined_title_description_requested: bool):
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
texts = self.raw_dataset['Description'].tolist()
if combined_title_description_requested:
summaries = self.raw_dataset['Incident_Summary'].tolist()
non_nulled_dataset = self.__remove_nulls_with_summaries(impacts, urgencies, texts, summaries)
else:
non_nulled_dataset = self.__remove_nulls(impacts, urgencies, texts)
self.corpus = DataFrame(non_nulled_dataset)
def __remove_nulls(self, impacts, urgencies, descriptions):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(descriptions[index])
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __remove_nulls_with_summaries(self, impacts, urgencies, descriptions, summaries):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(str(summaries[index]) + ' ' + str(descriptions[index]))
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
pnums = []
for priorities in zip(self.corpus['Impacts'], self.corpus['Urgencies']):
numbered_priority = sum([prio_to_num[priorities[0]], prio_to_num[priorities[1]]])
pnums.append(num_to_pnum[numbered_priority])
self.corpus['Priorities'] = pnums'''
'''
#Previous method, more efficient, way more lines though
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
descriptions = self.raw_dataset['Description'].tolist()
summaries = self.raw_dataset['Incident_Summary'].tolist()'''
'''dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': [], 'Summaries': []}
start1, start2, end1, end2 = 0, 0, 0, 0
start1 = time.perf_counter_ns()
for description, impact, urgency, summary in zip(descriptions, impacts, urgencies, summaries):
if not (impact is np.nan
or urgency is np.nan
or description is np.nan):
dict_corpus['Descriptions'].append(description)
dict_corpus['Impacts'].append(impact)
dict_corpus['Urgencies'].append(urgency)
dict_corpus['Summaries'].append(str(summary))
end1 = time.perf_counter_ns()'''
# start2 = time.perf_counter_ns()
# self.corpus = self.raw_dataset
# end2 = time.perf_counter_ns()
# timing1, timing2 = end1 - start1, end2 - start2
# print(f"Iterative: {timing1}, Pandas: {timing2}, difference = {abs(timing1-timing2)}")
# return dict_corpus
if __name__ == '__main__':
# obj = ITSupportDataset(combined_title_description_requested=False)
'''times = []
while True:
for x in range(100):
h1 = time.perf_counter_ns()
dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
times.append(time.perf_counter_ns() - h1)
# dataset = ITSupportDatasetBuilder().with_overall_priority_column().build()
print(numpy.mean(times))'''
# dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
'''ticket_data_low_prio = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
'\\Datasets\\ITSupport_Tickets.csv')
ticket_data_high_prio = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
corpus = concat([ticket_data_low_prio, ticket_data_high_prio])
corpus.to_pickle('corpus.pickle')'''
'''dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
print(dataset.corpus.shape)
dataset.corpus = dataset.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
print(dataset.corpus.shape)
print(dataset.corpus.loc[1])'''
# Load Dataset
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.build().corpus
print(dataset.shape)
print(dataset['Priority'].value_counts())
#generate_synonyms(dataset, 'Datasets/synonym_IT_tickets.csv')