Commit 4c72cb41 authored by Benjamin

Some Dataset class changes to be more generic

added some frontend for the prioritiser
parent a764ebc7
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd
# Script: predict priorities for a held-out slice of the bundled IT-support
# ticket datasets with the pre-trained TF-IDF + Keras CNN pair, then export the
# (description, predicted priority) pairs to a CSV file named by the user.
# Everything runs at import time, in the order written below.

# Load Dataset
# Both bundled CSVs are passed explicitly, located relative to the project root.
# The builder steps combine summaries with descriptions, add the overall
# priority column and pre-process the descriptions; `.corpus` is the resulting
# DataFrame.
dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
SKLearnModelFileInteraction())
# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
KerasModelFileInteraction())
# Split dataset into test and train
# NOTE(review): 0.1 is presumably the test fraction - confirm against
# TFIDF_Model.split_dataset. Only X_test_str is used by the active code below;
# X_train_str, y_train and y_test are referenced only by the disabled snippets.
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
# Convert the Descriptions to Sparse Matrices, representative of text
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
# The encoder maps between the 'P1'..'P5' labels and integer class indices.
# LabelEncoder stores its classes sorted, so index 0 is 'P1' and index 4 is 'P5'.
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
# The triple-quoted string below is an inert expression statement that keeps
# the disabled one-hot encoding of the labels; it has no effect at runtime.
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
# Make predictions
encoded_predictions = CNN_model.make_predictions(X_test)
# argmax(axis=1) selects one class index per row.
# NOTE(review): assumes make_predictions returns a 2-D array of per-class
# scores (rows = tickets) - confirm against KerasCNN.make_predictions.
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Represent accuracies
# Disabled: the evaluation plots are kept inside an inert string literal.
'''confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()'''
# export predictions to file
# The raw (un-vectorised) test descriptions are paired with their predicted
# priority labels, one row per ticket.
dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
# Blocks waiting for the user to type the output name; the 'csv' file type is
# fixed here.
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')
\ No newline at end of file
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd
# Script: prioritise every ticket in a user-supplied dataset file with the
# pre-trained TF-IDF + Keras CNN pair, plot the accuracy against the file's own
# 'Priority' column and export the predictions as CSV.
# Everything runs at import time, in the order written below.

# Ask for the dataset location instead of using the bundled ticket CSVs.
dataset_file_loc = input("Enter dataset file_location: ")
dataset = ITSupportDatasetBuilder(dataset_file_loc) \
    .with_summaries_and_descriptions_combined() \
    .with_overall_priority_column() \
    .with_pre_processed_descriptions() \
    .build().corpus

# Extract the two columns once; they are needed for vectorising, evaluating
# and exporting.
descriptions = dataset['Description'].tolist()
actual_priorities = dataset['Priority'].tolist()
priority_labels = ['P5', 'P4', 'P3', 'P2', 'P1']

# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
    SKLearnModelFileInteraction())

# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
CNN_model.from_file(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
    KerasModelFileInteraction())

# The whole file is treated as the prediction set (no train/test split).
X_test = TFIDF_model.vectorize_descriptions(descriptions)

# LabelEncoder stores its classes sorted, so index 0 is 'P1' and index 4 is 'P5'.
encoder = LabelEncoder().fit(list(priority_labels))

# Make predictions
encoded_predictions = CNN_model.make_predictions(X_test)
# NOTE(review): assumes make_predictions returns a 2-D array of per-class
# scores (rows = tickets) - confirm against KerasCNN.make_predictions.
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))

# Represent accuracies against the priorities recorded in the supplied file.
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, actual_priorities, list(priority_labels))
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
try:
    apc = AccuracyPerClass(decoded_predictions, actual_priorities, list(priority_labels))
    apc.plot_confusion_matrix()
except ZeroDivisionError as error:
    # Best-effort plot: keep going, but tell the user it was skipped instead of
    # swallowing the failure silently.
    # NOTE(review): presumably raised when a priority class has no tickets in
    # the supplied file - confirm in AccuracyPerClass.
    print(f"Skipping accuracy-per-class plot: {error}")

# export predictions to file
dict_descriptions_predictions = {'Description': descriptions, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')
from sklearn.preprocessing import LabelEncoder
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from projectsettings import DefaultConfig
from threading import Timer
import sys
# Script: interactive prioritiser. Repeatedly reads a pasted IT issue from
# standard input and prints the priority label predicted by the pre-trained
# TF-IDF + Keras CNN pair. Runs until the process is interrupted.
# NOTE(review): the page this was recovered from had lost all indentation; the
# nesting below is reconstructed from the control flow - confirm against the
# repository.

# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
    SKLearnModelFileInteraction())
# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
                    KerasModelFileInteraction())
# Convert P1-5 into categories the model understands
# LabelEncoder stores its classes sorted, so index 0 is 'P1' and index 4 is 'P5'.
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
# Flag flipped by the Timer thread to signal that the paste window has elapsed.
timer_finished = False


def amend_timer():
    """Timer callback: mark the current paste window as finished."""
    global timer_finished
    timer_finished = True


while True:
    # Start a fresh one-second paste window for every issue.
    timer_finished = False
    print("Paste IT issue here: ")
    contents = []
    t = Timer(1, amend_timer)
    t.start()
    # Collect lines until the timer has fired. The flag is only checked between
    # input() calls, and input() blocks, so the loop ends after the first line
    # that is read once the timer has expired.
    while not timer_finished:
        line = input()
        contents.append(line)
    t.cancel()
    print("...")
    # Multi-line pastes are flattened into one space-separated description.
    description = ' '.join(contents)
    # Convert the Descriptions to Sparse Matrices, representative of text
    # repr() passes the quoted, escape-encoded form of the text, not the raw text.
    vectorized_desc = TFIDF_model.vectorize_description(description=repr(description))
    # Make prediction
    encoded_prediction = CNN_model.make_predictions(vectorized_desc)
    # Convert prediction back to P5-P1
    # NOTE(review): assumes make_predictions returns a 2-D array of per-class
    # scores - confirm against KerasCNN.make_predictions.
    decoded_prediction = encoder.inverse_transform(encoded_prediction.argmax(axis=1))
    print(decoded_prediction)
...@@ -14,7 +14,6 @@ from tqdm import tqdm ...@@ -14,7 +14,6 @@ from tqdm import tqdm
from project_utilities import my_datasets from project_utilities import my_datasets
import preprocessing_functionality import preprocessing_functionality
class Doc2VecModels(Enum): class Doc2VecModels(Enum):
DBOW = 1 DBOW = 1
DM = 2 DM = 2
...@@ -111,7 +110,7 @@ class ITSupportDoc2VecImplementation: ...@@ -111,7 +110,7 @@ class ITSupportDoc2VecImplementation:
if __name__ == '__main__': if __name__ == '__main__':
dataset = my_datasets.ITSupportDatasetBuilder()\ '''dataset = my_datasets.ITSupportDatasetBuilder()\
.with_overall_priority_column()\ .with_overall_priority_column()\
.with_summaries_and_descriptions_combined()\ .with_summaries_and_descriptions_combined()\
.with_pre_processed_descriptions()\ .with_pre_processed_descriptions()\
...@@ -126,5 +125,5 @@ if __name__ == '__main__': ...@@ -126,5 +125,5 @@ if __name__ == '__main__':
print("time: " + str(time.perf_counter() - t1)) print("time: " + str(time.perf_counter() - t1))
doc2vec_IT.generate_vectors() doc2vec_IT.generate_vectors()
print(doc2vec_IT.tagged_training_documents[50]) print(doc2vec_IT.tagged_training_documents[50])
#print(doc2vec_IT.X_test) #print(doc2vec_IT.X_test)'''
from sklearn.preprocessing import LabelEncoder import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
from keras.utils import to_categorical #import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_KerasCNN_ToCSV
from custom_models.classifiers.DL_classifiers import KerasCNN #import ITSupportTicketPrioritisers.NoDataset_TFIDF_KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities.my_datasets import ITSupportDatasetBuilder
# Script: evaluate the pre-trained TF-IDF + Keras CNN pair on a held-out slice
# of the default ticket datasets and plot the results. Runs at import time.

# No paths are given, so the builder uses its default datasets.
dataset = ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# The model files are given as relative paths, so they are resolved against
# the current working directory.
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file('custom_models/feature_selection_extraction/tfidf_model.joblib', SKLearnModelFileInteraction())
CNN_model = KerasCNN()
CNN_model.from_file('custom_models/classifiers/CNN_model.h5', KerasModelFileInteraction())
# Vectorise every description first, then split the vectors and the priorities.
# NOTE(review): 0.1 is presumably the test fraction - confirm against
# TFIDF_Model.split_dataset.
vectorised_descriptions = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
X_train, X_test, y_train, y_test = TFIDF_model.split_dataset(0.1, vectorised_descriptions,
dataset['Priority'].tolist())
#vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
# LabelEncoder stores its classes sorted, so index 0 is 'P1' and index 4 is 'P5'.
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
# The triple-quoted string below is an inert expression statement that keeps
# the disabled one-hot encoding of the labels; it has no effect at runtime.
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
encoded_predictions = CNN_model.make_predictions(X_test)
# NOTE(review): assumes make_predictions returns a 2-D array of per-class
# scores (rows = tickets) - confirm against KerasCNN.make_predictions.
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Compare the predicted labels with the held-out true priorities.
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
...@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod ...@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod
import joblib import joblib
from keras.models import load_model, save_model from keras.models import load_model, save_model
class ModelFileInteraction(ABC):
class FileInteraction(ABC):
@staticmethod @staticmethod
@abstractmethod @abstractmethod
def load_from_file(filename): def load_from_file(filename):
...@@ -14,7 +15,7 @@ class ModelFileInteraction(ABC): ...@@ -14,7 +15,7 @@ class ModelFileInteraction(ABC):
pass pass
class SKLearnModelFileInteraction(ModelFileInteraction): class SKLearnModelFileInteraction(FileInteraction):
@staticmethod @staticmethod
def load_from_file(filename): def load_from_file(filename):
...@@ -25,11 +26,11 @@ class SKLearnModelFileInteraction(ModelFileInteraction): ...@@ -25,11 +26,11 @@ class SKLearnModelFileInteraction(ModelFileInteraction):
joblib.dump(model, filename) joblib.dump(model, filename)
class KerasModelFileInteraction(ModelFileInteraction): class KerasModelFileInteraction(FileInteraction):
@staticmethod @staticmethod
def load_from_file(filename): def load_from_file(filename):
return load_model(filename) return load_model(filename)
@staticmethod @staticmethod
def load_to_file(model, filename): def load_to_file(model, filename):
model.save(filename) save_model(model, filename)
from pandas import read_csv, DataFrame, concat, read_json, read_excel
from pandas import read_csv, DataFrame, concat
from dataclasses import dataclass from dataclasses import dataclass
import preprocessing_functionality import preprocessing_functionality
from projectsettings import DefaultConfig
@dataclass @dataclass
...@@ -12,21 +12,34 @@ class ITSupportDatasetWithBuilder: ...@@ -12,21 +12,34 @@ class ITSupportDatasetWithBuilder:
Contains an associated Builder Class for flexible object creation.""" Contains an associated Builder Class for flexible object creation."""
corpus = DataFrame corpus = DataFrame
def __init__(self): def __init__(self, *dataset_paths):
self.__get_raw_dataset() self.__get_raw_dataset(*dataset_paths)
self.__remove_nulls() self.__remove_nulls()
#self.corpus = self.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
def __get_raw_dataset(self): def __get_raw_dataset(self, *other_dataset_paths):
ticket_data_low_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT" if not other_dataset_paths:
"\\project_utilities\\Datasets\\ITSupport_Tickets.csv") ticket_data_low_prio = read_csv(
ticket_data_high_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT" f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv")
"\\project_utilities\\Datasets\\ITSupport_Tickets_High_Prio.csv") ticket_data_high_prio = read_csv(
self.corpus = concat([ticket_data_low_prio, ticket_data_high_prio]) f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv")
datasets = [ticket_data_low_prio, ticket_data_high_prio]
else:
datasets = [self.load_from_file(file) for file in other_dataset_paths]
self.corpus = concat(datasets)
print(self.corpus)
@staticmethod
def load_from_file(filename):
filetype = filename.split('.')[1].lower()
filetypes = {'csv': read_csv,
'xlsx': read_excel,
'json': read_json}
return filetypes[filetype](filename)
def combine_summaries_with_descriptions(self): def combine_summaries_with_descriptions(self):
combined_columns = [] combined_columns = []
for description, summary in zip(self.corpus['Description'].values, self.corpus['Incident_Summary'].values): for description, summary in zip(self.corpus['Description'].values, self.corpus['Incident Summary'].values):
combined_columns.append(str(summary) + ' ' + str(description)) combined_columns.append(str(summary) + ' ' + str(description))
self.corpus['Description'] = combined_columns self.corpus['Description'] = combined_columns
...@@ -58,9 +71,13 @@ class ITSupportDatasetWithBuilder: ...@@ -58,9 +71,13 @@ class ITSupportDatasetWithBuilder:
self.corpus['Description'] = self.corpus['Description'].apply(preprocessing_functionality.stem_text) self.corpus['Description'] = self.corpus['Description'].apply(preprocessing_functionality.stem_text)
class ITSupportDatasetBuilder(object): class ITSupportDatasetBuilder:
def __init__(self): def __init__(self, *dataset_filenames):
self._dataset = ITSupportDatasetWithBuilder() if not dataset_filenames:
self._dataset = ITSupportDatasetWithBuilder()
return
self._dataset = ITSupportDatasetWithBuilder(*dataset_filenames)
def with_summaries_and_descriptions_combined(self): def with_summaries_and_descriptions_combined(self):
self._dataset.combine_summaries_with_descriptions() self._dataset.combine_summaries_with_descriptions()
......
import pandas as pd
class ITSupportPredictionFormat:
    """Accumulates ticket predictions and writes them to a file.

    Predictions are held in a DataFrame that starts with the columns
    'Description' and 'PredictedPriority'; only those two columns are exported.
    """
    predictions: pd.DataFrame

    def __init__(self):
        prediction_format = {'Description': [], 'PredictedPriority': []}
        self.predictions = pd.DataFrame(prediction_format)

    def load_predictions(self, new_predictions: pd.DataFrame):
        """Append the rows of `new_predictions` to the stored predictions."""
        self.predictions = pd.concat([self.predictions, new_predictions])

    def save_predictions_to_file(self, filename: str, filetype: str):
        """Write the stored predictions to `filename` plus the type's extension.

        :param filename: output path without extension.
        :param filetype: 'csv' or 'xlsx'.
        :raises KeyError: if `filetype` is not a supported type.
        """
        filetypes = {'csv': self._to_csv,
                     'xlsx': self._to_excel}
        filetypes[filetype](filename)

    @staticmethod
    def _without_commas(value):
        """Return `value` with commas replaced by spaces if it is a string."""
        return value.replace(",", " ") if isinstance(value, str) else value

    def _to_csv(self, filename):
        """Write '<filename>.csv' with commas inside text cells replaced by spaces.

        Series.replace(",", " ") only replaces cells that are exactly ",",
        so the substitution is done per cell value instead. The stored
        predictions are left untouched; only the exported copy is sanitised.
        """
        sanitised = self.predictions.copy()
        for column in sanitised.columns:
            sanitised[column] = sanitised[column].map(self._without_commas)
        sanitised.to_csv(filename + '.csv', sep=",", columns=['Description', 'PredictedPriority'])

    def _to_excel(self, filename):
        """Write '<filename>.xlsx'.

        DataFrame.to_excel has no `sep` parameter (passing one raises
        TypeError), so none is given.
        """
        self.predictions.to_excel(filename + '.xlsx', columns=['Description', 'PredictedPriority'])
import os
class DefaultConfig:
    """Project-wide configuration helpers."""

    @staticmethod
    def absolute_project_root_path():
        """Return the absolute directory of this settings module, using '/' separators.

        `__file__` can be relative (for example when the module is run directly
        on older interpreters), which would make the result relative or empty.
        It is therefore resolved with `os.path.abspath` before the directory
        is taken.
        """
        return os.path.dirname(os.path.abspath(__file__)).replace("\\", "/")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment