Commit 109d33b1 authored by Benjamin

OOPified many classes.

Exported some models to files
parent ca7c93f5
import keras
import numpy
import pandas as pd
from project_utilities.ModelTemplates import KerasDeepLearningModel
from project_utilities.my_datasets import ITSupportDatasetBuilder
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.model_interaction import KerasModelFileInteraction, SKLearnModelFileInteraction
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection
import custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection
from project_utilities import my_datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
import tensorflow as tf
from keras import metrics
from pandas import DataFrame
class KerasCNN(KerasDeepLearningModel):
    """Keras classifier wrapper built on ``KerasDeepLearningModel``.

    The wrapped Keras model lives on ``self.model``; it can be passed to the
    constructor or loaded afterwards with the inherited ``from_file``.
    """

    def __init__(self, model=None):
        """Store ``model`` as the wrapped Keras model (``None`` if not given)."""
        # Forward the argument: previously it was dropped, so a model handed
        # to the constructor was silently ignored.
        super().__init__(model)

    def train_model(self, vectors, labels, test_vectors, test_labels, epochs=50, batch_size=50, callbacks=None):
        """Fit the wrapped model exactly once and return what ``fit`` returns.

        :param vectors: training inputs passed to ``self.model.fit``.
        :param labels: training targets passed to ``self.model.fit``.
        :param test_vectors: validation inputs.
        :param test_labels: validation targets.
        :param epochs: number of epochs to train for.
        :param batch_size: batch size used while training.
        :param callbacks: a single callback, a list/tuple of callbacks, or
            ``None`` for no callbacks.
        :return: the value returned by ``self.model.fit``.
        """
        fit_arguments = {
            'epochs': epochs,
            'batch_size': batch_size,
            'validation_data': (test_vectors, test_labels),
        }
        if callbacks is not None:
            # A bare callback is wrapped in a list (the original behaviour);
            # an existing list/tuple is passed through rather than nested.
            if isinstance(callbacks, (list, tuple)):
                fit_arguments['callbacks'] = list(callbacks)
            else:
                fit_arguments['callbacks'] = [callbacks]
        # Single fit call: the original fell through and trained a second
        # time with callbacks=[None] whenever no callbacks were supplied.
        return self.model.fit(vectors, labels, **fit_arguments)

    def make_predictions(self, vectors):
        """Return ``self.model.predict(vectors)``.

        Implements the abstract method declared on the base class so that
        ``KerasCNN`` can be instantiated.
        """
        return self.model.predict(vectors)
class PresetSoftmaxClassifier:
vectorized_dataset = DataFrame
classes = list
def make_predictions(self, vectors):
return self.model.predict(vectors)
def __init__(self, vectorized_dataset, classes: list):
self.vectorized_dataset = vectorized_dataset
self.classes = classes
if __name__ == '__main__':
dataset = my_datasets.ITSupportDatasetBuilder() \
if __name__ == "__main__":
dataset = ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
doc2vec_IT = custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection.ITSupportDoc2VecImplementation(
dataset=dataset, model_type=custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection.Doc2VecModels.DBOW)
# doc2vec_IT.pre_process_texts()
doc2vec_IT.tag_documents()
doc2vec_IT.create_model()
doc2vec_IT.build_vocabulary()
doc2vec_IT.train_model(dataset_shuffles=1, epochs=10) # dataset_shuffles=10, epochs=30)
print("Got here 0.5")
doc2vec_IT.generate_vectors()
Z = tf.keras.utils.to_categorical(dataset.Priority, num_classes=5)
print(Z)
'''descriptions_train, descriptions_test, tfidf.training_labels, tfidf.testing_labels = train_test_split(
dataset.Descriptions, Z, test_size=0.3,
random_state=1000)
vectorizer.fit(descriptions_train)
tfidf.training_descriptions = vectorizer.transform(descriptions_train)
tfidf.testing_descriptions = vectorizer.transform(descriptions_test)'''
# tfidf.training_labels = tf.keras.utils.to_categorical(tfidf.training_labels, num_classes=5)
print(dataset.train_labels)
# vectorizer.fit(tfidf.training_labels)
input_dim = dataset.training_descriptions.shape[1]#tfidf.training_descriptions.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=[metrics.Recall()])
# model.summary()
history = model.fit(tfidf.training_descriptions, tfidf.training_labels,
epochs=100,
verbose=False,
validation_data=(tfidf.testing_descriptions, tfidf.testing_labels),
batch_size=5)
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file('/custom_models/feature_selection_extraction/tfidf_model.joblib',
SKLearnModelFileInteraction())
loss, accuracy = model.evaluate(tfidf.testing_descriptions, tfidf.testing_labels, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
CNN_model = KerasCNN()
CNN_model.from_file('custom_models/classifiers/CNN_model.h5', KerasModelFileInteraction())
# matrix = metrics.confusion_matrix(tfidf.testing_labels.argmax(axis=1), y_prediction.argmax(axis=1))
y_prediction = model.predict(tfidf.testing_descriptions)
y_prediction = numpy.argmax(y_prediction, axis=1)
tfidf.testing_labels = numpy.argmax(tfidf.testing_labels, axis=1)
print(keras.metrics.categorical_accuracy(tfidf.testing_labels, y_prediction))
# tf.keras.metrics.confusion_matrix(tfidf.testing_labels.argmax(axis=1), y_prediction.argmax(axis=1))
vectorised_descriptions = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
X_train, X_test, y_train, y_test = TFIDF_model.split_dataset(0.1, vectorised_descriptions,
dataset['Priority'].tolist())
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))
# cm = ITSupportPriorityConfusionMatrixEvaluator(predictions=y_prediction, actual_values=tfidf.testing_labels, labels=['P1', 'P2', 'P3', 'P4', 'P5'])
# clear_session()
encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# keras.metrics.confusion_matrix(tfidf.testing_labels, y_prediction)
'''from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(tfidf.training_descriptions, tfidf.training_labels)
score = classifier.score(tfidf.testing_descriptions, tfidf.testing_labels)'''
from scikitplot.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(5, 5))
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
tfidf.testing_labels_lab = [num_to_pnum[x] for x in tfidf.testing_labels]
y_pred_lab = [num_to_pnum[x] for x in y_prediction]
# print(tfidf.testing_labels_lab, type(tfidf.testing_labels))
# plot_confusion_matrix(tfidf.testing_labels_lab, y_pred_lab, ax=ax, labels=['P1', 'P2', 'P3', 'P4', 'P5'])
# plt.show()
from project_utilities.evaluators import ITSupportPriorityConfusionMatrixEvaluator
cm = ITSupportPriorityConfusionMatrixEvaluator(
predictions=y_pred_lab,
actual_values=tfidf.testing_labels_lab,
labels=['P1', 'P2', 'P3', 'P4', 'P5'])
cm.plot_confusion_matrix(fullscreen_requested=True)
from enum import Enum
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from project_utilities.ModelTemplates import SKLearnMachineLearningModel
class ModelType(Enum):
    """Enumerates the kinds of classical machine-learning classifier."""

    # Linear models
    MULTINOMIAL_LOGISTIC_REGRESSION = 1
    # Probabilistic models
    MULTINOMIAL_NAIVE_BAYES = 2
    # Margin-based models
    LINEAR_SUPPORT_VECTOR_CLASSIFICATION = 3
    # Tree ensembles
    RANDOM_FOREST = 4
class ITMultinomialLogisticRegression(SKLearnMachineLearningModel):
    """Multinomial logistic regression wrapped as an ``SKLearnMachineLearningModel``."""

    def __init__(self, inverse_regularisation_strength: float = 1e5, cores_allocated: int = 1):
        """Build the estimator and hand it to the base class.

        :param inverse_regularisation_strength: passed to ``LogisticRegression`` as ``C``.
        :param cores_allocated: passed to ``LogisticRegression`` as ``n_jobs``.
        """
        estimator_settings = {
            'n_jobs': cores_allocated,
            'C': inverse_regularisation_strength,
            'multi_class': 'multinomial',
            'solver': 'newton-cg',
            'verbose': 1,
        }
        estimator = LogisticRegression(**estimator_settings)
        super().__init__(estimator)
class ITMachineLearningClassifierImplementation:
cores_allocated: int
class ITMultinomialNaiveBayes(SKLearnMachineLearningModel):
    """Multinomial naive Bayes wrapped as an ``SKLearnMachineLearningModel``."""

    def __init__(self):
        """Create a default ``MultinomialNB`` and hand it to the base class."""
        estimator = MultinomialNB()
        super().__init__(estimator)
def __init__(self, vectors, labels, cores_allocated: int = 1) -> None:
self.model = None
self.cores_allocated = cores_allocated
self.vectors = vectors
self.labels = labels
def use_preconfigured_model(self, preconfigured_model):
self.model = preconfigured_model
class ITSupportVectorClassifier(SKLearnMachineLearningModel):
    """Linear support-vector classifier wrapped as an ``SKLearnMachineLearningModel``."""

    def __init__(self):
        """Create a default ``LinearSVC`` and hand it to the base class."""
        estimator = LinearSVC()
        super().__init__(estimator)
def train_model(self):
self.model.fit(self.vectors, self.labels)
def make_predictions(self, items):
return self.model.predict(items)
class ITMultinomialLogisticRegression(ITMachineLearningClassifierImplementation):
    """Multinomial logistic regression bound to a fixed set of vectors and labels."""

    def __init__(self, vectors, labels, inverse_regularisation_strength: float, cores_allocated: int = 1):
        """Initialise the base implementation, then attach the estimator.

        :param vectors: feature vectors forwarded to the base class.
        :param labels: target labels forwarded to the base class.
        :param inverse_regularisation_strength: passed to ``LogisticRegression`` as ``C``.
        :param cores_allocated: forwarded to the base class; the estimator's
            ``n_jobs`` is then read back from ``self.cores_allocated``.
        """
        super().__init__(vectors=vectors, labels=labels, cores_allocated=cores_allocated)
        estimator_settings = dict(
            n_jobs=self.cores_allocated,
            C=inverse_regularisation_strength,
            multi_class='multinomial',
            solver='newton-cg',
            verbose=1,
        )
        self.model = LogisticRegression(**estimator_settings)
class ITMultinomialNaiveBayes(ITMachineLearningClassifierImplementation):
    """Multinomial naive Bayes bound to a fixed set of vectors and labels."""

    def __init__(self, vectors, labels):
        """Initialise the base implementation, then attach a default ``MultinomialNB``."""
        super().__init__(vectors=vectors, labels=labels)
        estimator = MultinomialNB()
        self.model = estimator
class ITSupportVectorClassifier(ITMachineLearningClassifierImplementation):
    """Linear support-vector classifier bound to a fixed set of vectors and labels."""

    def __init__(self, vectors, labels):
        """Initialise the base implementation, then attach a default ``LinearSVC``."""
        super().__init__(vectors=vectors, labels=labels)
        estimator = LinearSVC()
        self.model = estimator
class ITRandomForestClassifier(ITMachineLearningClassifierImplementation):
    """Random-forest classifier bound to a fixed set of vectors and labels."""

    def __init__(self, vectors, labels, tree_quantity: int = 200, max_tree_depth: int = 10, randomness: int = 1):
        """Initialise the base implementation, then attach the estimator.

        :param vectors: feature vectors forwarded to the base class.
        :param labels: target labels forwarded to the base class.
        :param tree_quantity: passed to ``RandomForestClassifier`` as ``n_estimators``.
        :param max_tree_depth: passed to ``RandomForestClassifier`` as ``max_depth``.
        :param randomness: passed to ``RandomForestClassifier`` as ``random_state``.
        """
        super().__init__(vectors, labels)
        # The estimator must be stored: the original built it and discarded
        # it, leaving self.model as None so training and prediction failed.
        self.model = RandomForestClassifier(n_estimators=tree_quantity,
                                            max_depth=max_tree_depth,
                                            random_state=randomness)
class ITRandomForestClassifier(SKLearnMachineLearningModel):
    """Random-forest classifier wrapped as an ``SKLearnMachineLearningModel``."""

    def __init__(self, tree_quantity: int = 200, max_tree_depth: int = 10, randomness: int = 1):
        """Build the estimator and hand it to the base class.

        :param tree_quantity: passed to ``RandomForestClassifier`` as ``n_estimators``.
        :param max_tree_depth: passed to ``RandomForestClassifier`` as ``max_depth``.
        :param randomness: passed to ``RandomForestClassifier`` as ``random_state``.
        """
        forest = RandomForestClassifier(
            n_estimators=tree_quantity,
            max_depth=max_tree_depth,
            random_state=randomness,
        )
        super().__init__(forest)
if __name__ == "__main__":
......
......@@ -9,7 +9,8 @@ from project_utilities import evaluators
import pandas
import numba
from custom_models.classifiers import ML_classifiers
import joblib
from project_utilities import model_interaction
@numba.jit(forceobj=1)
......@@ -19,31 +20,33 @@ def preprocess_corpus(corpus: pandas.DataFrame, *columns):
return corpus
class ITSupportTFIDFImplementation:
class TFIDF_Model:
vectorizer = TfidfVectorizer
dataset = pandas.DataFrame
vectorized_descriptions: list
training_descriptions = \
testing_descriptions = \
training_labels = \
testing_labels = numpy.ndarray
def __init__(self, dataset: pandas.DataFrame):
tqdm.pandas(desc="progress-bar")
def __init__(self):
self.vectorizer = TfidfVectorizer(max_features=10000)
self.dataset = dataset
def vectorize_descriptions(self):
self.vectorized_descriptions = self.vectorizer.fit_transform(self.dataset['Description'].values).toarray()
def fit_to_corpus(self, texts):
self.vectorizer.fit(texts)
def from_file(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
self.vectorizer = model_loader.load_from_file(filename)
def split_dataset(self, percentage_testing: float):
self.training_descriptions, self.testing_descriptions, self.training_labels, self.testing_labels = \
train_test_split(self.vectorized_descriptions, self.dataset['Priority'].values,
test_size=percentage_testing, random_state=1000)
def to_file(self, filename):
joblib.dump(self.vectorizer, filename)
def vectorize_description(self, description):
return self.vectorizer.transform([description]).toarray()
def Main():
def vectorize_descriptions(self, descriptions):
return self.vectorizer.transform(descriptions).toarray()
@staticmethod
def split_dataset(percentage_testing: float, X, y):
return train_test_split(X, y, test_size=percentage_testing, random_state=1000)
'''def Main():
# Get Dataset
dataset = my_datasets.ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
......@@ -51,22 +54,57 @@ def Main():
.with_pre_processed_descriptions() \
.build().corpus
tfidf = ITSupportTFIDFImplementation(dataset)
tfidf.vectorize_descriptions()
tfidf.split_dataset(0.1)
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
vectorised_descriptions = tfidf.vectorize_descriptions(dataset['Description'].tolist())
# print(tfidf.vectorized_descriptions[0].shape)
X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, vectorised_descriptions, dataset['Priority'].tolist())
logreg = ML_classifiers.ITMultinomialLogisticRegression(vectors=tfidf.training_descriptions,
labels=tfidf.training_labels,
cores_allocated=-1,
inverse_regularisation_strength=1e5)
print('Training Model')
logreg = ML_classifiers.ITMultinomialLogisticRegression()
logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
# logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
print('Training Model')
logreg.train_model()
label_predictions = logreg.make_predictions(tfidf.testing_descriptions)
joblib.dump(logreg, "logreg_model.joblib")
print("finished!")
# print(X_train, X_test)
# logreg.train_model(X_train, y_train)
# logreg.save_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
label_predictions = logreg.make_predictions(X_test)
print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))
# print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))
labels = ['P5', 'P4', 'P3', 'P2', 'P1']
cm = evaluators.ITSupportPriorityConfusionMatrixEvaluator(label_predictions, tfidf.testing_labels, labels)
from sklearn import metrics
# print(metrics.classification_report(y_test, label_predictions))
cm = evaluators.ITSupportPriorityConfusionMatrixEvaluator(label_predictions, y_test, labels)
cm.plot_confusion_matrix(fullscreen_requested=True)
# user_issue = input("Enter ticket desc: ")'''
if __name__ == '__main__':
Main()
# Main()
# Get Dataset
'''dataset = my_datasets.ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
tfidf = ITSupportTFIDFImplementation()
tfidf.fit_to_corpus(dataset['Description'].tolist())
tfidf.to_file('tfidf_model.joblib')'''
# Get Dataset
'''dataset = my_datasets.ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
tfidf = ITSupportTFIDFImplementation(dataset)
tfidf.vectorize_descriptions()
logreg = joblib.load("logreg_model.joblib")
IT_issue = input("Enter IT issue to be prioritised: ")
preprocessed_input = tfidf.vectorize_description(IT_issue)
label_predictions = logreg.make_predictions(preprocessed_input)
print(label_predictions)'''
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities.my_datasets import ITSupportDatasetBuilder
from project_utilities import my_datasets, evaluators
from custom_models.feature_selection_extraction import ML_DL_feature_extraction_selection, algorithmic_feature_extraction_selection
dataset = ITSupportDatasetBuilder() \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file('custom_models/feature_selection_extraction/tfidf_model.joblib', SKLearnModelFileInteraction())
if __name__ == '__main__':
algorithmic_feature_extraction_selection.Main()
CNN_model = KerasCNN()
CNN_model.from_file('custom_models/classifiers/CNN_model.h5', KerasModelFileInteraction())
vectorised_descriptions = TFIDF_model.vectorize_descriptions(dataset['Description'].tolist())
X_train, X_test, y_train, y_test = TFIDF_model.split_dataset(0.1, vectorised_descriptions,
dataset['Priority'].tolist())
#vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
from abc import ABC, abstractmethod
from project_utilities import model_interaction
from keras.models import load_model
class SKLearnMachineLearningModel(ABC):
    """Abstract wrapper around an estimator stored on ``self.model``.

    ``__init__`` is abstract, so every concrete subclass has to define its
    own constructor (typically delegating here with a ready-made estimator).
    """

    cores_allocated: int

    @abstractmethod
    def __init__(self, model=None):
        """Store ``model`` (or ``None``) as the wrapped estimator."""
        self.model = model

    def use_preconfigured_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
        """Replace the wrapped estimator with the one ``model_loader`` reads from ``filename``."""
        restored_estimator = model_loader.load_from_file(filename)
        self.model = restored_estimator

    def save_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
        """Write the wrapped estimator to ``filename`` through ``model_loader``."""
        estimator = self.model
        model_loader.load_to_file(estimator, filename)

    def train_model(self, vectors, labels):
        """Fit the wrapped estimator on ``vectors`` and ``labels``."""
        estimator = self.model
        estimator.fit(vectors, labels)

    def make_predictions(self, items):
        """Return the wrapped estimator's predictions for ``items``."""
        predictions = self.model.predict(items)
        return predictions
class KerasDeepLearningModel(ABC):
    """Abstract wrapper around a Keras model stored on ``self.model``.

    Concrete subclasses must supply ``__init__``, ``train_model`` and
    ``make_predictions``.
    """

    @abstractmethod
    def __init__(self, model=None):
        """Store ``model`` (or ``None``) as the wrapped model."""
        self.model = model

    def from_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
        """Replace the wrapped model with the one ``model_loader`` reads from ``filename``."""
        restored_model = model_loader.load_from_file(filename)
        self.model = restored_model

    def to_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
        """Write the wrapped model to ``filename`` through ``model_loader``."""
        wrapped_model = self.model
        model_loader.load_to_file(wrapped_model, filename)

    def add_model_config(self, layer):
        """Append ``layer`` to the wrapped model via its ``add`` method."""
        wrapped_model = self.model
        wrapped_model.add(layer)

    def compile_model(self, loss_function, optimizer, *metrics):
        """Compile the wrapped model with the given loss, optimizer and metrics."""
        metric_list = list(metrics)
        self.model.compile(loss=loss_function, optimizer=optimizer, metrics=metric_list)

    @abstractmethod
    def train_model(self, vectors, labels, test_vectors, test_labels, epochs, batch_size):
        """Train the wrapped model; implemented by subclasses."""

    @abstractmethod
    def make_predictions(self, vectors):
        """Return predictions for ``vectors``; implemented by subclasses."""
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from matplotlib.pyplot import show, subplots, get_current_fig_manager
import matplotlib.pyplot as plt
from pandas import DataFrame
from numpy import sum as numpy_sum, ndarray, empty_like
from dataclasses import dataclass
global CURRENT_FIGURES
class ITSupportPriorityConfusionMatrixEvaluator:
class DetailedConfusionMatrix:
"""Class for storing and showing a confusion matrix.
Adapted from https://www.kaggle.com/code/agungor2/various-confusion-matrix-plots/notebook"""
......@@ -36,7 +39,7 @@ class ITSupportPriorityConfusionMatrixEvaluator:
dataset_confusion_matrix_data_frame.index.name = 'Actual'
dataset_confusion_matrix_data_frame.columns.name = 'Predicted'
label_quantity = len(self.labels)
fig, ax = subplots(figsize=(label_quantity, label_quantity))
fig, ax = plt.subplots(figsize=(label_quantity, label_quantity))
# Adapted from https://stackoverflow.com/questions/42111075/seaborn-heatmap-color-scheme-based-on-row-values
normalised_confusion_matrix = dataset_confusion_matrix_data_frame.div(
......@@ -46,10 +49,10 @@ class ITSupportPriorityConfusionMatrixEvaluator:
# Adapted from https://stackoverflow.com/questions/12439588/how-to-maximize-a-plt-show-window-using-python
# (dinvlad)
if fullscreen_requested:
fig_manager = get_current_fig_manager()
fig_manager = plt.get_current_fig_manager()
fig_manager.window.state('zoomed')
show()
plt.show()
def __update_dataset_annotations(self):
n_rows, n_columns = self.dataset_confusion_matrix.shape
......@@ -65,4 +68,56 @@ class ITSupportPriorityConfusionMatrixEvaluator:
cell_percentage_of_category, cell_predicted_count, category_count)
else:
self.dataset_annotations[row, column] = '%d%%\n%d/%d' % (0, 0, category_count)
@dataclass
class AccuracyPerClass:
    """Tally correct and incorrect predictions per class and plot them as stacked bars."""

    label_predictions: list  # predicted label for each sample
    actual_labels: list  # true label for each sample, paired by position with label_predictions
    label_classes: list  # class labels, in the order they are counted and plotted

    @property
    def confusion_matrix(self):
        """Return the confusion matrix of actual labels against predicted labels."""
        return confusion_matrix(self.actual_labels, self.label_predictions)

    def sort_to_correct_incorrect_predictions(self):
        """Count correct and incorrect predictions for every class.

        Counts are keyed on the actual label and returned in ``label_classes``
        order, so they line up with the bar labels in ``plot_confusion_matrix``.
        (The original hard-coded P5..P1, which only lined up when
        ``label_classes`` happened to use that exact order.)

        :return: ``(correct_counts, incorrect_counts)``, two lists of ints.
        :raises KeyError: if an actual label is not in ``label_classes``.
        """
        correct_predictions = dict.fromkeys(self.label_classes, 0)
        incorrect_predictions = dict.fromkeys(self.label_classes, 0)
        for predicted, actual in zip(self.label_predictions, self.actual_labels):
            tally = correct_predictions if predicted == actual else incorrect_predictions
            tally[actual] += 1
        return list(correct_predictions.values()), list(incorrect_predictions.values())

    def normalise_correct_incorrect(self, correct, incorrect):
        """Convert per-class counts into per-class proportions.

        :param correct: correct-prediction count for each class.
        :param incorrect: incorrect-prediction count for each class.
        :return: ``(correct_proportions, incorrect_proportions)``; a class with
            no samples yields ``0.0`` for both instead of dividing by zero.
        """
        totals = [hits + misses for hits, misses in zip(correct, incorrect)]
        normalised_correct_predictions = [
            hits / total if total else 0.0 for hits, total in zip(correct, totals)
        ]
        normalised_incorrect_predictions = [
            misses / total if total else 0.0 for misses, total in zip(incorrect, totals)
        ]
        return normalised_correct_predictions, normalised_incorrect_predictions

    def plot_confusion_matrix(self):
        """Show a stacked bar chart of correct vs. incorrect proportion per class.

        Adapted from firstly phind prompts:
        1. How do you directly compare two confusion matrices
        2. Generate a python script that shows percentage accuracy of 5 different classes
        3. Can you generate the code to plot this with matplotlib
        4. Make the script plot the percentage accuracy of each class
        Then Bing AI GPT Prompts:
        1. generate a python function that plots correct and incorrect for a specified number of classes
        2. could you normalize each class so the bars are equal
        :return: None
        """
        labels = [f'{label}' for label in self.label_classes]
        correct, incorrect = self.sort_to_correct_incorrect_predictions()
        normalised_correct, normalised_incorrect = self.normalise_correct_incorrect(correct, incorrect)
        width = 0.35
        fig, ax = plt.subplots()
        ax.bar(labels, normalised_correct, width, label='Correct')
        # Stack the incorrect share on top of the correct share.
        ax.bar(labels, normalised_incorrect, width, bottom=normalised_correct, label='Incorrect')
        ax.set_ylabel('Correct - Incorrect Proportion')
        ax.legend()
        plt.show()
from abc import ABC, abstractmethod
import joblib
from keras.models import load_model, save_model
class ModelFileInteraction(ABC):
    """Interface for reading a model from a file and writing one back."""

    @staticmethod
    @abstractmethod
    def load_from_file(filename):
        """Return the model stored at ``filename``; implemented by subclasses."""

    @staticmethod
    @abstractmethod
    def load_to_file(model, filename):
        """Write ``model`` to ``filename``; implemented by subclasses."""
class SKLearnModelFileInteraction(ModelFileInteraction):
    """``ModelFileInteraction`` implementation that reads and writes with joblib."""

    @staticmethod
    def load_from_file(filename):
        """Return the object joblib loads from ``filename``."""
        restored = joblib.load(filename)
        return restored

    @staticmethod
    def load_to_file(model, filename):
        """Dump ``model`` to ``filename`` with joblib."""
        joblib.dump(model, filename)
class KerasModelFileInteraction(ModelFileInteraction):
    """``ModelFileInteraction`` implementation for Keras models."""

    @staticmethod
    def load_from_file(filename):
        """Return the model that ``load_model`` reads from ``filename``."""
        restored = load_model(filename)
        return restored

    @staticmethod
    def load_to_file(model, filename):
        """Write ``model`` to ``filename`` via the model's own ``save`` method."""
        model.save(filename)
from pandas import read_csv, read_pickle, DataFrame, concat
from pandas import read_csv, DataFrame, concat
from dataclasses import dataclass
import preprocessing_functionality
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
corpus = DataFrame
raw_dataset = DataFrame
def __init__(self, combined_title_description_requested: bool = False):
self.__get_raw_dataset()
self.__get_dataset(combined_title_description_requested)
self.__add_overall_priority_column()
def __get_raw_dataset(self):
self.raw_dataset = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
'\\Datasets\\ITSupport_Tickets.csv')
#ticket_data_high_prio = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
#'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
#self.raw_dataset = ticket_data_low_prio
def __get_dataset(self, combined_title_description_requested: bool):
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
texts = self.raw_dataset['Description'].tolist()
if combined_title_description_requested:
summaries = self.raw_dataset['Incident_Summary'].tolist()
non_nulled_dataset = self.__remove_nulls_with_summaries(impacts, urgencies, texts, summaries)
else:
non_nulled_dataset = self.__remove_nulls(impacts, urgencies, texts)
self.corpus = DataFrame(non_nulled_dataset)
def __remove_nulls(self, impacts, urgencies, descriptions):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(descriptions[index])
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __remove_nulls_with_summaries(self, impacts, urgencies, descriptions, summaries):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(str(summaries[index]) + ' ' + str(descriptions[index]))
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
pnums = []
for priorities in zip(self.corpus['Impacts'], self.corpus['Urgencies']):
numbered_priority = sum([prio_to_num[priorities[0]], prio_to_num[priorities[1]]])
pnums.append(num_to_pnum[numbered_priority])
self.corpus['Priorities'] = pnums'''
@dataclass
class ITSupportDatasetWithBuilder:
......@@ -81,12 +15,13 @@ class ITSupportDatasetWithBuilder:
def __init__(self):
self.__get_raw_dataset()
self.__remove_nulls()
#self.corpus = self.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
def __get_raw_dataset(self):
ticket_data_low_prio = read_csv('/\\project_utilities'
'\\Datasets\\ITSupport_Tickets.csv')
ticket_data_high_prio = read_csv('/\\project_utilities'
'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
ticket_data_low_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT"
"\\project_utilities\\Datasets\\ITSupport_Tickets.csv")
ticket_data_high_prio = read_csv("C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT"
"\\project_utilities\\Datasets\\ITSupport_Tickets_High_Prio.csv")
self.corpus = concat([ticket_data_low_prio, ticket_data_high_prio])
def combine_summaries_with_descriptions(self):
......@@ -143,6 +78,71 @@ class ITSupportDatasetBuilder(object):
return self._dataset
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
corpus = DataFrame
raw_dataset = DataFrame
def __init__(self, combined_title_description_requested: bool = False):
self.__get_raw_dataset()
self.__get_dataset(combined_title_description_requested)
self.__add_overall_priority_column()
def __get_raw_dataset(self):
self.raw_dataset = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
'\\Datasets\\ITSupport_Tickets.csv')
#ticket_data_high_prio = read_csv('C:\\Users\\Benjamin\\PycharmProjects\\DISSERTATION_ARTEFACT\\project_utilities'
#'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
#self.raw_dataset = ticket_data_low_prio
def __get_dataset(self, combined_title_description_requested: bool):
impacts = self.raw_dataset['Impact'].tolist()
urgencies = self.raw_dataset['Urgency'].tolist()
texts = self.raw_dataset['Description'].tolist()
if combined_title_description_requested:
summaries = self.raw_dataset['Incident_Summary'].tolist()
non_nulled_dataset = self.__remove_nulls_with_summaries(impacts, urgencies, texts, summaries)
else:
non_nulled_dataset = self.__remove_nulls(impacts, urgencies, texts)
self.corpus = DataFrame(non_nulled_dataset)
def __remove_nulls(self, impacts, urgencies, descriptions):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(descriptions[index])
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __remove_nulls_with_summaries(self, impacts, urgencies, descriptions, summaries):
dict_corpus = {'Descriptions': [], 'Impacts': [], 'Urgencies': []}
for index in range(len(impacts)):
if not (impacts[index] is np.nan
or urgencies[index] is np.nan
or descriptions[index] is np.nan):
dict_corpus['Descriptions'].append(str(summaries[index]) + ' ' + str(descriptions[index]))
dict_corpus['Impacts'].append(impacts[index])
dict_corpus['Urgencies'].append(urgencies[index])
return dict_corpus
def __add_overall_priority_column(self):
prio_to_num = {'Low': 0, 'Medium': 1, 'High': 2}
num_to_pnum = ['P5', 'P4', 'P3', 'P2', 'P1']
pnums = []
for priorities in zip(self.corpus['Impacts'], self.corpus['Urgencies']):
numbered_priority = sum([prio_to_num[priorities[0]], prio_to_num[priorities[1]]])
pnums.append(num_to_pnum[numbered_priority])
self.corpus['Priorities'] = pnums'''
'''
#Previous method, more efficient, way more lines though
impacts = self.raw_dataset['Impact'].tolist()
......@@ -189,3 +189,6 @@ if __name__ == '__main__':
corpus.to_pickle('corpus.pickle')'''
dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
print(dataset.corpus.shape)
dataset.corpus = dataset.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
print(dataset.corpus.shape)
print(dataset.corpus.loc[1])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment