Some Dataset class changes to be more generic

added some frontend for the prioritiser

Some Dataset class changes to be more generic
added some frontend for the prioritiser
4c72cb41 · Benjamin · a764ebc7 · 4c72cb41 · 4c72cb41 · 4c72cb41
Commit 4c72cb41 authored Apr 21, 2023 by Benjamin
9 changed files
--- a/ITSupportTicketPrioritisers/DefaultDatasets_TFIDF_KerasCNN_ToCSV.py
+++ b/ITSupportTicketPrioritisers/DefaultDatasets_TFIDF_KerasCNN_ToCSV.py
+from sklearn.preprocessing import LabelEncoder
+from keras.utils import to_categorical
+from custom_models.classifiers.DL_classifiers import KerasCNN
+from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
+from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
+from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
+from project_utilities import predictionformats
+from project_utilities.my_datasets import ITSupportDatasetBuilder
+from projectsettings import DefaultConfig
+import pandas as pd
+# Script: predict priorities for a held-out slice of the bundled ticket datasets
+# with the pre-configured TF-IDF + Keras CNN models and export them to CSV.
+# Load Dataset
+# The two default ticket CSVs are passed explicitly; `.corpus` is the resulting DataFrame.
+dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+                                  f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
+    .with_summaries_and_descriptions_combined() \
+    .with_overall_priority_column() \
+    .with_pre_processed_descriptions() \
+    .build().corpus
+# Load Pre-configured TF-IDF
+TFIDF_model = TFIDF_Model()
+TFIDF_model.from_file(
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    SKLearnModelFileInteraction())
+# Load Pre-configured Keras CNN
+CNN_model = KerasCNN()
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+                    KerasModelFileInteraction())
+# Split dataset into test and train
+# NOTE(review): 0.1 is presumably the test fraction - confirm against TFIDF_Model.split_dataset.
+# Only X_test_str is used by the active code below; X_train_str, y_train and y_test
+# are referenced solely inside the disabled (string-literal) sections.
+X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
+                                                                     dataset['Priority'].tolist())
+# Convert the Descriptions to Sparse Matrices, representative of text
+X_test = TFIDF_model.vectorize_descriptions(X_test_str)
+# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
+# The encoder translates between the 'P1'..'P5' labels and integer class indices.
+encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
+# The triple-quoted string below is a bare expression statement used to disable
+# code; it has no runtime effect.
+'''y_train = to_categorical(encoder.transform(y_train))
+y_val = to_categorical(encoder.transform(y_test))'''
+# Make predictions
+encoded_predictions = CNN_model.make_predictions(X_test)
+# argmax(axis=1) selects one class index per row, which inverse_transform maps back to a label.
+# NOTE(review): assumes make_predictions returns a 2-D array of per-class scores - confirm.
+decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
+# Represent accuracies
+# Evaluation plots are currently disabled by wrapping them in a string literal.
+'''confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
+confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
+apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
+apc.plot_confusion_matrix()'''
+# export predictions to file
+# Pair each held-out description with its predicted label, one row per ticket.
+dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
+formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
+prediction_saver = predictionformats.ITSupportPredictionFormat()
+prediction_saver.load_predictions(formatted_predictions)
+# Blocks for a base filename typed by the user; the saver is asked for the 'csv' format.
+filename = input("Enter filename: ")
+prediction_saver.save_predictions_to_file(filename, 'csv')
\ No newline at end of file
--- a/ITSupportTicketPrioritisers/FromCSV_TFIDF_KerasCNN_ToCSV.py
+++ b/ITSupportTicketPrioritisers/FromCSV_TFIDF_KerasCNN_ToCSV.py
+from sklearn.preprocessing import LabelEncoder
+from keras.utils import to_categorical
+from custom_models.classifiers.DL_classifiers import KerasCNN
+from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
+from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
+from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
+from project_utilities import predictionformats
+from project_utilities.my_datasets import ITSupportDatasetBuilder
+from projectsettings import DefaultConfig
+import pandas as pd
+# Script: load a user-supplied dataset file, predict a priority for every ticket
+# with the pre-configured TF-IDF + Keras CNN models, plot the evaluation and
+# export the predictions to CSV.
+# The triple-quoted string below only records the default dataset locations for
+# reference; it is a bare expression statement with no runtime effect.
+'''f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv")'''
+# Blocks until the user types the location of the dataset file to prioritise.
+dataset_file_loc = input("Enter dataset file_location: ")
+dataset = ITSupportDatasetBuilder(dataset_file_loc) \
+    .with_summaries_and_descriptions_combined() \
+    .with_overall_priority_column() \
+    .with_pre_processed_descriptions() \
+    .build().corpus
+# Load the pre-configured TF-IDF vectoriser and Keras CNN from the project tree.
+TFIDF_model = TFIDF_Model()
+TFIDF_model.from_file(
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    SKLearnModelFileInteraction())
+CNN_model = KerasCNN()
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+                    KerasModelFileInteraction())
+#X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
+#                                                                     dataset['Priority'].tolist())
+# X_train = TFIDF_model.vectorize_descriptions(X_train_str)
+# No train/test split here: every description in the supplied file is vectorised and predicted.
+X_test = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
+# vectorised_descriptions = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
+# X_train, X_test, y_train, y_test = TFIDF_model.split_dataset(0.1, vectorised_descriptions,
+# dataset['Priority'].tolist())
+# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
+# The encoder translates between the 'P1'..'P5' labels and integer class indices.
+encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
+'''y_train = to_categorical(encoder.transform(y_train))
+y_val = to_categorical(encoder.transform(y_test))'''
+encoded_predictions = CNN_model.make_predictions(X_test)
+# argmax(axis=1) selects one class index per row, which inverse_transform maps back to a label.
+decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
+# Evaluate the predictions against the priorities present in the supplied file.
+confusion_matrix = DetailedConfusionMatrix(decoded_predictions, dataset['Priority'].tolist(), ['P5', 'P4', 'P3', 'P2', 'P1'])
+confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
+try:
+    apc = AccuracyPerClass(decoded_predictions, dataset['Priority'].tolist(), ['P5', 'P4', 'P3', 'P2', 'P1'])
+    apc.plot_confusion_matrix()
+except ZeroDivisionError as error:
+    # The per-class plot stays best-effort, but the user is told it was skipped
+    # instead of the failure being swallowed silently.
+    print(f"Accuracy-per-class plot skipped (ZeroDivisionError: {error})")
+# Pair each description with its predicted label and export the result.
+dict_descriptions_predictions = {'Description': dataset['Description'].tolist(), 'PredictedPriority': decoded_predictions}
+formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
+prediction_saver = predictionformats.ITSupportPredictionFormat()
+prediction_saver.load_predictions(formatted_predictions)
+# Blocks for a base filename typed by the user; the saver is asked for the 'csv' format.
+filename = input("Enter filename: ")
+prediction_saver.save_predictions_to_file(filename, 'csv')
--- a/ITSupportTicketPrioritisers/NoDataset_TFIDF_KerasCNN.py
+++ b/ITSupportTicketPrioritisers/NoDataset_TFIDF_KerasCNN.py
+from sklearn.preprocessing import LabelEncoder
+from custom_models.classifiers.DL_classifiers import KerasCNN
+from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
+from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
+from projectsettings import DefaultConfig
+from threading import Timer
+import sys
+# Script: interactive prioritiser. Reads a pasted ticket description from stdin,
+# vectorises it with the pre-configured TF-IDF model and prints the CNN's
+# predicted priority label, then repeats.
+# Load Pre-configured TF-IDF
+TFIDF_model = TFIDF_Model()
+TFIDF_model.from_file(
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    SKLearnModelFileInteraction())
+# Load Pre-configured Keras CNN
+CNN_model = KerasCNN()
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+                    KerasModelFileInteraction())
+# Convert P1-5 into categories the model understands
+encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
+# Flag set by the Timer thread to end the line-reading loop below.
+timer_finished = False
+def amend_timer():
+    """Mark the read window as elapsed; used as the Timer callback."""
+    global timer_finished
+    timer_finished = True
+# There is no exit condition: the prompt repeats until the process is interrupted.
+while True:
+    timer_finished = False
+    print("Paste IT issue here: ")
+    contents = []
+    # A 1-second Timer flips timer_finished from another thread so that a
+    # multi-line paste can be collected line by line.
+    t = Timer(1, amend_timer)
+    t.start()
+    # input() blocks, so the flag is only re-checked after each line has been
+    # read; at least one line is always consumed per prompt.
+    while not timer_finished:
+        line = input()
+        contents.append(line)
+    t.cancel()
+    print("...")
+    # The collected lines are flattened into a single space-separated string.
+    description = ' '.join(contents)
+    # Convert the Descriptions to Sparse Matrices, representative of text
+    # NOTE(review): repr() adds surrounding quotes and escapes special
+    # characters before vectorising - confirm this is intended.
+    vectorized_desc = TFIDF_model.vectorize_description(description=repr(description))
+    # Make prediction
+    encoded_prediction = CNN_model.make_predictions(vectorized_desc)
+    # Convert prediction back to P5-P1
+    decoded_prediction = encoder.inverse_transform(encoded_prediction.argmax(axis=1))
+    print(decoded_prediction)
--- a/custom_models/feature_selection_extraction/ML_DL_feature_extraction_selection.py
+++ b/custom_models/feature_selection_extraction/ML_DL_feature_extraction_selection.py
@@ -14,7 +14,6 @@ from tqdm import tqdm
 from project_utilities import my_datasets
 import preprocessing_functionality
 class Doc2VecModels(Enum):
    DBOW = 1
    DM = 2
@@ -111,7 +110,7 @@ class ITSupportDoc2VecImplementation:
 if __name__ == '__main__':
-    dataset = my_datasets.ITSupportDatasetBuilder()\
+    '''dataset = my_datasets.ITSupportDatasetBuilder()\
        .with_overall_priority_column()\
        .with_summaries_and_descriptions_combined()\
        .with_pre_processed_descriptions()\
@@ -126,5 +125,5 @@ if __name__ == '__main__':
    print("time: " + str(time.perf_counter() - t1))
    doc2vec_IT.generate_vectors()
    print(doc2vec_IT.tagged_training_documents[50])
-    #print(doc2vec_IT.X_test)
+    #print(doc2vec_IT.X_test)'''
--- a/main.py
+++ b/main.py
-from sklearn.preprocessing import LabelEncoder
+import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
-from keras.utils import to_categorical
+#import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_KerasCNN_ToCSV
-from custom_models.classifiers.DL_classifiers import KerasCNN
+#import ITSupportTicketPrioritisers.NoDataset_TFIDF_KerasCNN
-from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
-from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
-from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
-from project_utilities.my_datasets import ITSupportDatasetBuilder
-dataset = ITSupportDatasetBuilder() \
-    .with_summaries_and_descriptions_combined() \
-    .with_overall_priority_column() \
-    .with_pre_processed_descriptions() \
-    .build().corpus
-TFIDF_model = TFIDF_Model()
-TFIDF_model.from_file('custom_models/feature_selection_extraction/tfidf_model.joblib', SKLearnModelFileInteraction())
-CNN_model = KerasCNN()
-CNN_model.from_file('custom_models/classifiers/CNN_model.h5', KerasModelFileInteraction())
-vectorised_descriptions = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
-X_train, X_test, y_train, y_test = TFIDF_model.split_dataset(0.1, vectorised_descriptions,
-                                                             dataset['Priority'].tolist())
-#vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
-encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
-'''y_train = to_categorical(encoder.transform(y_train))
-y_val = to_categorical(encoder.transform(y_test))'''
-encoded_predictions = CNN_model.make_predictions(X_test)
-decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
-confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
-confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
-apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
-apc.plot_confusion_matrix()
--- a/project_utilities/model_interaction.py
+++ b/project_utilities/model_interaction.py
@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod
 import joblib
 from keras.models import load_model, save_model
-class ModelFileInteraction(ABC):
+class FileInteraction(ABC):
    @staticmethod
    @abstractmethod
    def load_from_file(filename):
@@ -14,7 +15,7 @@ class ModelFileInteraction(ABC):
        pass
-class SKLearnModelFileInteraction(ModelFileInteraction):
+class SKLearnModelFileInteraction(FileInteraction):
    @staticmethod
    def load_from_file(filename):
@@ -25,11 +26,11 @@ class SKLearnModelFileInteraction(ModelFileInteraction):
        joblib.dump(model, filename)
-class KerasModelFileInteraction(ModelFileInteraction):
+class KerasModelFileInteraction(FileInteraction):
+    """File load/save strategy for Keras models, built on keras.models helpers."""
    @staticmethod
    def load_from_file(filename):
+        """Return the model that keras' load_model reads from `filename`."""
        return load_model(filename)
    @staticmethod
    def load_to_file(model, filename):
+        """Save `model` to `filename` (despite the name, this method writes the file)."""
-        model.save(filename)
+        save_model(model, filename)
--- a/project_utilities/my_datasets.py
+++ b/project_utilities/my_datasets.py
+from pandas import read_csv, DataFrame, concat, read_json, read_excel
-from pandas import read_csv, DataFrame, concat
 from dataclasses import dataclass
 import preprocessing_functionality
+from projectsettings import DefaultConfig
 @dataclass
@@ -12,21 +12,34 @@ class ITSupportDatasetWithBuilder:
    Contains an associated Builder Class for flexible object creation."""
    corpus = DataFrame
-    def __init__(self):
+    def __init__(self, *dataset_paths):
+        """Load the raw corpus, then drop nulls.
+
+        `dataset_paths` are forwarded unchanged to the raw-dataset loader; with
+        no paths that loader falls back to the bundled default CSVs.
+        """
-        self.__get_raw_dataset()
+        self.__get_raw_dataset(*dataset_paths)
        self.__remove_nulls()
-        #self.corpus = self.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
-    def __get_raw_dataset(self):
+    def __get_raw_dataset(self, *other_dataset_paths):
+        """Populate `self.corpus` from the default CSVs or from the given files.
+
+        With no paths, the two ticket CSVs under the project root's
+        project_utilities/Datasets folder are read with read_csv. Otherwise
+        each path is read through `load_from_file`. The resulting frames are
+        concatenated into `self.corpus`.
+        """
-        ticket_data_low_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT"
+        if not other_dataset_paths:
-                                        "\\project_utilities\\Datasets\\ITSupport_Tickets.csv")
+            ticket_data_low_prio = read_csv(
-        ticket_data_high_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT"
+                f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv")
-                                         "\\project_utilities\\Datasets\\ITSupport_Tickets_High_Prio.csv")
+            ticket_data_high_prio = read_csv(
-        self.corpus = concat([ticket_data_low_prio, ticket_data_high_prio])
+                f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv")
+            datasets = [ticket_data_low_prio, ticket_data_high_prio]
+        else:
+            datasets = [self.load_from_file(file) for file in other_dataset_paths]
+        # Each source frame keeps its own index labels here (concat is called with defaults).
+        self.corpus = concat(datasets)
+        # NOTE(review): prints the whole corpus on every load - looks like leftover debug output.
+        print(self.corpus)
+    @staticmethod
+    def load_from_file(filename):
+        """Read one dataset file with the pandas reader matching its extension.
+
+        Supported extensions (case-insensitive): csv, xlsx, json.
+
+        :param filename: path of the dataset file to read.
+        :return: whatever the selected pandas reader returns for the file.
+        :raises KeyError: if the extension is not one of the supported types.
+        """
+        # Use the text after the LAST dot: dots elsewhere in the path (for
+        # example "./data/tickets.csv" or "C:/Users/a.b/tickets.csv") must not
+        # be mistaken for the start of the extension.
+        filetype = filename.rsplit('.', 1)[-1].lower()
+        filetypes = {'csv': read_csv,
+                     'xlsx': read_excel,
+                     'json': read_json}
+        return filetypes[filetype](filename)
    def combine_summaries_with_descriptions(self):
        combined_columns = []
-        for description, summary in zip(self.corpus['Description'].values, self.corpus['Incident_Summary'].values):
+        for description, summary in zip(self.corpus['Description'].values, self.corpus['Incident Summary'].values):
            combined_columns.append(str(summary) + ' ' + str(description))
        self.corpus['Description'] = combined_columns
@@ -58,9 +71,13 @@ class ITSupportDatasetWithBuilder:
        self.corpus['Description'] = self.corpus['Description'].apply(preprocessing_functionality.stem_text)
-class ITSupportDatasetBuilder(object):
+class ITSupportDatasetBuilder:
-    def __init__(self):
+    def __init__(self, *dataset_filenames):
+        """Create the dataset to configure, optionally from explicit files.
+
+        `dataset_filenames` are forwarded to ITSupportDatasetWithBuilder; when
+        none are given the dataset is constructed with no arguments.
+        """
-        self._dataset = ITSupportDatasetWithBuilder()
+        # Unpacking an empty tuple passes no arguments, so a single call covers
+        # both the "no filenames" and the "explicit filenames" cases.
+        self._dataset = ITSupportDatasetWithBuilder(*dataset_filenames)
    def with_summaries_and_descriptions_combined(self):
        self._dataset.combine_summaries_with_descriptions()

--- a/project_utilities/predictionformats.py
+++ b/project_utilities/predictionformats.py
+import pandas as pd
+class ITSupportPredictionFormat:
+    """Accumulates ticket predictions and writes them to a CSV or Excel file.
+
+    `predictions` is a DataFrame with 'Description' and 'PredictedPriority' columns.
+    """
+    predictions: pd.DataFrame
+    def __init__(self):
+        """Start with an empty frame that already has the two expected columns."""
+        prediction_format = {'Description': [], 'PredictedPriority': []}
+        self.predictions = pd.DataFrame(prediction_format)
+    def load_predictions(self, new_predictions: pd.DataFrame):
+        """Append `new_predictions` to the predictions collected so far."""
+        self.predictions = pd.concat([self.predictions, new_predictions])
+    def save_predictions_to_file(self, filename: str, filetype: str):
+        """Write the predictions to `filename` plus the extension for `filetype`.
+
+        :param filename: output path without extension.
+        :param filetype: 'csv' or 'xlsx'.
+        :raises KeyError: if `filetype` is not a supported format.
+        """
+        filetypes = {'csv': self._to_csv,
+                     'xlsx': self._to_excel}
+        filetypes[filetype](filename)
+    def _to_csv(self, filename):
+        """Write '<filename>.csv' with commas inside text cells replaced by spaces."""
+        # Series.replace(",", " ") only swaps cells that are exactly ","; regex=True
+        # makes the replacement apply to commas embedded in longer strings.
+        # A copy is written out so the stored predictions keep their original text.
+        comma_free = self.predictions.replace(",", " ", regex=True)
+        comma_free.to_csv(filename + '.csv', sep=",", columns=['Description', 'PredictedPriority'])
+    def _to_excel(self, filename):
+        """Write '<filename>.xlsx' containing the two prediction columns."""
+        # DataFrame.to_excel has no `sep` parameter; passing one raises TypeError.
+        self.predictions.to_excel(filename + '.xlsx', columns=['Description', 'PredictedPriority'])
--- a/projectsettings.py
+++ b/projectsettings.py
+import os
+class DefaultConfig:
+    """Project-wide default settings."""
+    @staticmethod
+    def absolute_project_root_path():
+        """Return the absolute directory containing this file, using '/' separators."""
+        # abspath() keeps the result absolute even if __file__ is a relative
+        # path, which the bare dirname(__file__) would pass straight through.
+        return os.path.dirname(os.path.abspath(__file__)).replace("\\", "/")