Commit 1d16a38d authored by Benjamin

almost done

parent 4c72cb41
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import custom_models.classifiers.ML_classifiers
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd
# Load Dataset
dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
'''doc2vec_model.from_file(
f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
GensimWordEmbeddingModelFileInteraction())'''
tagged_training_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_train_str, 'Priority': y_train}))
tagged_testing_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_test_str, 'Priority': y_test}))
doc2vec_model.build_vocabulary(tagged_training_documents)
doc2vec_model.train_model(tagged_training_documents, dataset_shuffles=10, epochs=10)
#doc2vec_model.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#tagged_descriptions = doc2vec_model.tag_documents(X_test_str)
X_train = doc2vec_model.vectorize_documents(X_train_str)
X_test = doc2vec_model.vectorize_documents(X_test_str)
# Load Logistic Regression model
logreg_model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
'''logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_to_logreg_model.joblib',
SKLearnModelFileInteraction())'''
logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())
# Make predictions
predictions = logreg_model.make_predictions(X_test)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
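# A sketch of exporting these predictions to CSV, mirroring the ITSupportPredictionFormat
# usage in the other prioritiser scripts in this repo (kept commented out, like theirs):
'''formatted_predictions = pd.DataFrame({'Description': X_test_str, 'PredictedPriority': predictions})
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
prediction_saver.save_predictions_to_file(input("Enter filename: "), 'csv')'''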
@@ -11,22 +11,25 @@ from projectsettings import DefaultConfig
import pandas as pd
# Load Dataset
-dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
-                                  f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
+dataset = ITSupportDatasetBuilder(
+    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
+    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
-    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
    SKLearnModelFileInteraction())
# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
KerasModelFileInteraction())
# Split dataset into test and train
@@ -47,16 +50,16 @@ encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Represent accuracies
-'''confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
+confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
-apc.plot_confusion_matrix()'''
+apc.plot_confusion_matrix()
# export predictions to file
-dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
+'''dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
-prediction_saver.save_predictions_to_file(filename, 'csv')
\ No newline at end of file
+prediction_saver.save_predictions_to_file(filename, 'csv')'''
\ No newline at end of file
@@ -21,11 +21,11 @@ dataset = ITSupportDatasetBuilder(dataset_file_loc) \
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
-    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
KerasModelFileInteraction())
#X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
......
@@ -9,12 +9,12 @@ import sys
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
-    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
# Load Pre-configured Keras CNN
CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
KerasModelFileInteraction())
# Convert P1-5 into categories the model understands
......
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.classifiers.ML_classifiers import ITMultinomialLogisticRegression
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd
# Load Dataset
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
SKLearnModelFileInteraction())
# Load Logistic Regression model
logreg_model = ITMultinomialLogisticRegression(cores_allocated=1)
logreg_model.use_preconfigured_model(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_logreg_model_larger.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
# Convert the Descriptions to Sparse Matrices, representative of text
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
#X_train = TFIDF_model.vectorize_descriptions(X_train_str)
#logreg_model.train_model(X_train, y_train)
# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connection across the whole campus, this needs fixing ASAP")
#encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''
# Make predictions
predictions = logreg_model.make_predictions(X_test)
#decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
# export predictions to file
'''dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')'''
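# A sketch of prioritising a single ad-hoc ticket with the loaded TF-IDF + logistic
# regression, assuming the vectorize_description/make_predictions APIs used elsewhere in this repo:
'''IT_issue = input("Enter IT issue to be prioritised: ")
vectorised_issue = TFIDF_model.vectorize_description(IT_issue)
print(logreg_model.make_predictions(vectorised_issue))'''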
@@ -3,15 +3,18 @@ from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from project_utilities.ModelTemplates import SKLearnMachineLearningModel
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from projectsettings import DefaultConfig
from project_utilities.model_interaction import SKLearnModelFileInteraction
from project_utilities import my_datasets
class ITMultinomialLogisticRegression(SKLearnMachineLearningModel):
def __init__(self, inverse_regularisation_strength: float = 1e5, cores_allocated: int = 1):
super().__init__(LogisticRegression(n_jobs=cores_allocated,
C=inverse_regularisation_strength,
multi_class='multinomial',
solver='newton-cg',
-                                           verbose=1))
+                                           verbose=1,
+                                           max_iter=10000))
class ITMultinomialNaiveBayes(SKLearnMachineLearningModel):
@@ -30,5 +33,24 @@ class ITRandomForestClassifier(SKLearnMachineLearningModel):
if __name__ == "__main__":
# logreg = ITMultinomialLogisticRegression(6, 1e5)
pass
# Get Dataset
dataset = my_datasets.ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
logreg = ITMultinomialLogisticRegression(1e5, 6)
tfidf = TFIDF_Model()
tfidf.from_file(f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib", SKLearnModelFileInteraction())
X = dataset['Description'].tolist()
y = dataset['Priority'].tolist()
X_train_str, X_test_str, y_train, y_test = tfidf.split_dataset(0.1, X, y)
X_train = tfidf.vectorize_descriptions(X_train_str)
X_test = tfidf.vectorize_descriptions(X_test_str)
logreg.train_model(X_train, y_train)
logreg.save_model('tfidf_to_logreg_model.joblib', SKLearnModelFileInteraction())
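    # A sketch of evaluating the freshly trained model, borrowing the evaluators the
    # prioritiser scripts use (assumes an extra import of DetailedConfusionMatrix):
    # from project_utilities.evaluators import DetailedConfusionMatrix
    # predictions = logreg.make_predictions(X_test)
    # DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1']).plot_confusion_matrix(fullscreen_requested=True)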
@@ -9,121 +9,107 @@ from pandas import DataFrame
from sklearn import utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from project_utilities import model_interaction
from project_utilities.ModelTemplates import GensimWordEmbeddingModel
from gensim.models.doc2vec import Doc2Vec
# from project_utilities import preprocessing_functionality, my_datasets
from project_utilities import my_datasets
import preprocessing_functionality
class Doc2VecModels(Enum):
DBOW = 1
DM = 2
COMBINED = 3
-class ITSupportDoc2VecImplementation:
-    dataset = DataFrame
-    tagged_training_documents = DataFrame
-    tagged_testing_documents = DataFrame
+class ITSupportDoc2VecImplementation(GensimWordEmbeddingModel):
    model_type = Doc2VecModels
-    model = gensim.models.Doc2Vec
-    train_descriptions = \
-    test_descriptions = \
-    train_labels = \
-    test_labels = tuple
-    def __init__(self, dataset, model_type):
-        self.dataset = dataset
+    def __init__(self, model_type, alpha_change=-.002):
        self.model_type = model_type
+        self.alpha_change = None
+        model = self.create_model()
+        self.alpha_change = alpha_change
        tqdm.pandas(desc="progress-bar")
+        super().__init__(model)
-    def split_texts(self):
-        training_data, testing_data = train_test_split(self.dataset, test_size=0.1, random_state=1000)
+    @staticmethod
+    def split_texts(dataset):
+        training_data, testing_data = train_test_split(dataset, test_size=0.1, random_state=1000)
        return training_data, testing_data
-    def tag_documents(self):
-        training_documents, testing_documents = self.split_texts()
-        self.tagged_training_documents = training_documents.apply(
-            lambda docs: gensim.models.doc2vec.TaggedDocument(
-                words=preprocessing_functionality.tokenize_text(docs.Description),
-                tags=[docs.Priority]),
-            axis=1)
-        self.tagged_testing_documents = testing_documents.apply(
+    def tag_documents(self, documents) -> DataFrame:
+        tagged_documents = documents.apply(
            lambda docs: gensim.models.doc2vec.TaggedDocument(
                words=preprocessing_functionality.tokenize_text(docs.Description),
                tags=[docs.Priority]),
            axis=1)
+        return tagged_documents
def create_model(self):
cores = multiprocessing.cpu_count()
match self.model_type:
case Doc2VecModels.DBOW:
-                self._create_dbow_model(cores)
+                return self._create_dbow_model(cores)
case Doc2VecModels.DM:
-                self._create_dm_model(cores)
-            case Doc2VecModels.COMBINED:
-                self._create_combined_model(cores)
+                return self._create_dm_model(cores)
case _:
raise TypeError("Must be a Doc2Vec model type (DBOW, DM, COMBINED)")
def _create_dbow_model(self, cores):
-        self.model = gensim.models.Doc2Vec(
+        model = Doc2Vec(
            dm=0, vector_size=1000, negative=5, hs=0, min_count=2, sample=0, workers=cores)
-        self.alpha_change = 0.0002
+        return model
def _create_dm_model(self, cores):
-        self.model = gensim.models.Doc2Vec(
+        model = Doc2Vec(
            dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
            min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
-        self.alpha_change = -0.002
+        return model
-    def _create_combined_model(self, cores):
-        dbow_model = gensim.models.Doc2Vec(
-            dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
-        dm_model = gensim.models.Doc2Vec(
-            dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
-            min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
-        self.model = ConcatenatedDoc2Vec([dbow_model, dm_model])
-    def build_vocabulary(self):
-        vocabulary = [x for x in tqdm(self.tagged_training_documents.values)]
+    def build_vocabulary(self, tagged_training_documents):
+        vocabulary = [x for x in tqdm(tagged_training_documents.values)]
self.model.build_vocab(vocabulary)
-    def train_model(self, dataset_shuffles: int = 1, epochs: int = 1):
+    def train_model(self, tagged_training_documents, dataset_shuffles: int = 1, epochs: int = 1):
        for training_round in range(dataset_shuffles):
-            shuffled_training_data = utils.shuffle([x for x in tqdm(self.tagged_training_documents.values)])
-            datapoint_quantity = len(self.tagged_training_documents)
-            self.model.train(shuffled_training_data, total_examples=datapoint_quantity,
-                             epochs=epochs)
+            # shuffle training data
+            shuffled_training_data = utils.shuffle([x for x in tqdm(tagged_training_documents.values)])
+            dataset_size = len(tagged_training_documents)
+            self.model.train(shuffled_training_data, total_examples=dataset_size, epochs=epochs)
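            # Decay the learning rate after each shuffle; alpha_change defaults to a
            # negative value, so alpha shrinks as training progresses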
self.model.alpha += self.alpha_change
self.model.min_alpha = self.model.alpha
-    #@numba.jit(forceobj=True)
+    # @numba.jit(forceobj=True)
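    # Infer one embedding per tagged document; returns the tags (labels) and the
    # inferred vectors as two parallel tuples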
def vectorize_tagged_documents(self, tagged_documents):
sentences = tagged_documents.values
targets, regressors = zip(*[(doc.tags[0], self.model.infer_vector(doc.words)) for doc in sentences])
return targets, regressors
-    def generate_vectors(self):
-        self.train_labels, self.train_descriptions = self.vectorize_tagged_documents(self.tagged_training_documents)
-        self.test_labels, self.test_descriptions = self.vectorize_tagged_documents(self.tagged_testing_documents)
+    def generate_training_vectors(self, tagged_documents):
+        labels, descriptions = self.vectorize_tagged_documents(tagged_documents)
+        return labels, descriptions
def vectorize_documents(self, documents):
documents = [document.split(' ') for document in documents]
return [self.model.infer_vector(document) for document in documents]
if __name__ == '__main__':
-    '''dataset = my_datasets.ITSupportDatasetBuilder()\
+    dataset = my_datasets.ITSupportDatasetBuilder()\
.with_overall_priority_column()\
.with_summaries_and_descriptions_combined()\
.with_pre_processed_descriptions()\
-        .build()
-    doc2vec_IT = ITSupportDoc2VecImplementation(dataset=dataset.corpus, model_type=Doc2VecModels.DM)
-    #doc2vec_IT.pre_process_texts()
-    doc2vec_IT.tag_documents()
-    doc2vec_IT.create_model()
-    t1 = time.perf_counter()
-    doc2vec_IT.build_vocabulary()
-    doc2vec_IT.train_model(dataset_shuffles=1, epochs=1)
-    print("time: " + str(time.perf_counter() - t1))
-    doc2vec_IT.generate_vectors()
-    print(doc2vec_IT.tagged_training_documents[50])
-    #print(doc2vec_IT.X_test)'''
+        .build().corpus
+    doc2vec_IT = ITSupportDoc2VecImplementation(model_type=Doc2VecModels.DBOW, alpha_change=-0.002)
+    training_documents, testing_documents = doc2vec_IT.split_texts(dataset)
+    tagged_training_documents = doc2vec_IT.tag_documents(training_documents)
+    tagged_testing_documents = doc2vec_IT.tag_documents(testing_documents)
+    doc2vec_IT.build_vocabulary(tagged_training_documents)
+    doc2vec_IT.train_model(tagged_training_documents, dataset_shuffles=3, epochs=10)
+    doc2vec_IT.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
+    #doc2vec_IT.generate_vectors()
+    #print(doc2vec_IT.X_test)
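    # A sketch of round-tripping the saved model, assuming the from_file/vectorize_documents
    # APIs defined above:
    # doc2vec_IT.from_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
    # vectors = doc2vec_IT.vectorize_documents(["wifi outage across campus"])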
import numpy
from tqdm import tqdm
import projectsettings
from project_utilities import my_datasets, preprocessing_functionality
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
@@ -8,9 +10,10 @@ from sklearn.metrics import classification_report
from project_utilities import evaluators
import pandas
import numba
-from custom_models.classifiers import ML_classifiers
+#from custom_models.classifiers import ML_classifiers
import joblib
from project_utilities import model_interaction
from projectsettings import DefaultConfig
@numba.jit(forceobj=1)
@@ -26,18 +29,23 @@ class TFIDF_Model:
def __init__(self):
self.vectorizer = TfidfVectorizer(max_features=10000)
    # Fit the TF-IDF vectorizer to a training corpus
def fit_to_corpus(self, texts):
self.vectorizer.fit(texts)
-    def from_file(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Load TF-IDF from file
+    def from_file(self, filename, model_loader: model_interaction.FileInteraction):
self.vectorizer = model_loader.load_from_file(filename)
# Save TFIDF to file
def to_file(self, filename):
joblib.dump(self.vectorizer, filename)
# Get vector for a single text
def vectorize_description(self, description):
return self.vectorizer.transform([description]).toarray()
# Get vectors for multiple texts
def vectorize_descriptions(self, descriptions):
return self.vectorizer.transform(descriptions).toarray()
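    # Note: .toarray() densifies the sparse TF-IDF matrix into n_documents x 10000 floats;
    # for a large corpus, returning self.vectorizer.transform(descriptions) unchanged
    # (a scipy sparse matrix) would be the memory-friendly alternative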
@@ -61,15 +69,15 @@ class TFIDF_Model:
X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, vectorised_descriptions, dataset['Priority'].tolist())
logreg = ML_classifiers.ITMultinomialLogisticRegression()
-    logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
+    # logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
-    logreg.use_preconfigured_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
+    # logreg.use_preconfigured_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
print('Training Model')
    logreg.train_model(X_train, y_train)
joblib.dump(logreg, "logreg_model.joblib")
joblib.dump(logreg, "tfidf_to_logreg_model.joblib")
print("finished!")
# print(X_train, X_test)
# logreg.train_model(X_train, y_train)
-    # logreg.save_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
+    # logreg.save_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
label_predictions = logreg.make_predictions(X_test)
# print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))
@@ -95,16 +103,24 @@ if __name__ == '__main__':
tfidf.fit_to_corpus(dataset['Description'].tolist())
tfidf.to_file('tfidf_model.joblib')'''
# Get Dataset
-    '''dataset = my_datasets.ITSupportDatasetBuilder() \
+    dataset = my_datasets.ITSupportDatasetBuilder(
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
-    tfidf = ITSupportTFIDFImplementation(dataset)
-    tfidf.vectorize_descriptions()
-    logreg = joblib.load("logreg_model.joblib")
-    IT_issue = input("Enter IT issue to be prioritised: ")
-    preprocessed_input = tfidf.vectorize_description(IT_issue)
-    label_predictions = logreg.make_predictions(preprocessed_input)
-    print(label_predictions)'''
+    tfidf = TFIDF_Model()
+    X = dataset['Description'].tolist()
+    y = dataset['Priority'].tolist()
+    X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, X, y)
+    #tfidf.vectorize_descriptions(X_train)
+    tfidf.fit_to_corpus(X_train)
+    tfidf.to_file('tfidf_larger_model.joblib')
+    #logreg = joblib.load("tfidf_to_logreg_model.joblib")
+    #IT_issue = input("Enter IT issue to be prioritised: ")
+    #preprocessed_input = tfidf.vectorize_description(IT_issue)
+    #label_predictions = logreg.make_predictions(preprocessed_input)
+    #print(label_predictions)
from gensim.models import Word2Vec
from project_utilities import my_datasets
from projectsettings import DefaultConfig
import numpy as np
from custom_models.classifiers.ML_classifiers import ITMultinomialLogisticRegression
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
# Load Dataset
dataset = my_datasets.ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
dataset['Description'] = dataset['Description'].apply(lambda x: x.split(' '))
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
# Create and train the Word2Vec model
model = Word2Vec(sentences=X_train_str, vector_size=250, window=5, min_count=3, workers=16)
# Save the model
#model.save("word2vec.model")
def get_vectors(texts):
X_vectors = []
for sentence in texts:
sentence_vectors = []
for word in sentence:
if word in model.wv:
sentence_vectors.append(model.wv[word])
else:
# Handle words not in the vocabulary
sentence_vectors.append(np.zeros(model.vector_size))
X_vectors.append(np.mean(sentence_vectors, axis=0))
X_vectors = np.array(X_vectors)
return X_vectors
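# get_vectors mean-pools the word vectors of each sentence into one fixed-length document
# vector; out-of-vocabulary words contribute zero vectors, so e.g. a 10-word ticket with
# 3 unknown words is averaged over 7 real vectors and 3 zero vectors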
X_vectors = get_vectors(X_train_str)
X_test_vectors = get_vectors(X_test_str)
logreg = ITMultinomialLogisticRegression()
logreg.train_model(X_vectors, y_train)
pred = logreg.make_predictions(X_test_vectors)
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(pred, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(pred, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
\ No newline at end of file
-import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
+#import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
#import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_KerasCNN_ToCSV
#import ITSupportTicketPrioritisers.NoDataset_TFIDF_KerasCNN
#import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_SKLearnLogReg_ToCSV
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D, Dropout, LSTM
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.regularizers import l2
from project_utilities import my_datasets
from project_utilities import evaluators
import pandas as pd
import tensorflow as tf
# Load Dataset
dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.with_pre_processed_descriptions() \
.build().corpus
# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
SKLearnModelFileInteraction())
# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
dataset['Priority'].tolist())
X_train_tfidf = TFIDF_model.vectorize_descriptions(X_train_str)
X_test_tfidf = TFIDF_model.vectorize_descriptions(X_test_str)
# Encode class labels
encoder = LabelEncoder()
encoder.fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = encoder.transform(y_train)
y_val = encoder.transform(y_test)
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
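# LabelEncoder sorts its classes alphabetically, so the fitted mapping is P1->0 ... P5->4;
# e.g. to_categorical(encoder.transform(['P1'])) -> [[1., 0., 0., 0., 0.]]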
input_dim = X_train_tfidf.shape[1]
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.60))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.40))
model.add(Dense(5, activation='softmax'))
# Compile model
opt = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)
# Train model
# Define early stopping callback to halt training once validation loss stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model.fit(X_train_tfidf, y_train, epochs=50, batch_size=50, validation_data=(X_test_tfidf, y_val), callbacks=[early_stopping])
model.save('CNN_model_larger_regularised.h5')
print("finished")
'''# Make predictions
encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))
# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()'''
\ No newline at end of file
from abc import ABC, abstractmethod
from project_utilities import model_interaction
from keras.models import load_model
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
class SKLearnMachineLearningModel(ABC):
@@ -10,15 +11,19 @@ class SKLearnMachineLearningModel:
def __init__(self, model=None):
self.model = model
-    def use_preconfigured_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Load model from file
+    def use_preconfigured_model(self, filename, model_loader: model_interaction.FileInteraction):
self.model = model_loader.load_from_file(filename)
-    def save_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Save model to file
+    def save_model(self, filename, model_loader: model_interaction.FileInteraction):
model_loader.load_to_file(self.model, filename)
# Train model
def train_model(self, vectors, labels):
self.model.fit(vectors, labels)
# Given items, predict priority
def make_predictions(self, items):
return self.model.predict(items)
@@ -29,10 +34,10 @@ class KerasDeepLearningModel(ABC):
def __init__(self, model=None):
self.model = model
-    def from_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
+    def from_file(self, filename, model_loader: model_interaction.FileInteraction):
        self.model = model_loader.load_from_file(filename)
-    def to_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
+    def to_file(self, filename, model_loader: model_interaction.FileInteraction):
model_loader.load_to_file(self.model, filename)
def add_model_config(self, layer):
@@ -48,3 +53,17 @@ class KerasDeepLearningModel(ABC):
@abstractmethod
def make_predictions(self, vectors):
pass
class GensimWordEmbeddingModel(ABC):
    model: Doc2Vec | Word2Vec
def __init__(self, model=None):
self.model = model
def from_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
self.model = model_loader.load_from_file(filename)
print(self.model)
def to_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
model_loader.load_to_file(self.model, filename)
from abc import ABC, abstractmethod
import joblib
from keras.models import load_model, save_model
from gensim.models.doc2vec import Doc2Vec
class FileInteraction(ABC):
@@ -34,3 +35,14 @@ class KerasModelFileInteraction(FileInteraction):
@staticmethod
def load_to_file(model, filename):
save_model(model, filename)
class GensimWordEmbeddingModelFileInteraction(FileInteraction):
@staticmethod
def load_from_file(filename):
return Doc2Vec.load(filename)
@staticmethod
def load_to_file(model, filename):
model.save(filename)
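# Example usage (a sketch; assumes `model` is a trained gensim Doc2Vec instance):
# GensimWordEmbeddingModelFileInteraction.load_to_file(model, "doc2vec_model.model")
# model = GensimWordEmbeddingModelFileInteraction.load_from_file("doc2vec_model.model")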
import pandas
from pandas import read_csv, DataFrame, concat, read_json, read_excel
from dataclasses import dataclass
import preprocessing_functionality
from projectsettings import DefaultConfig
import nlpaug.augmenter.word as naw
from bs4 import BeautifulSoup
@dataclass
class ITSupportDatasetWithBuilder:
@@ -13,23 +15,15 @@ class ITSupportDatasetWithBuilder:
corpus = DataFrame
def __init__(self, *dataset_paths):
self.__get_raw_dataset(*dataset_paths)
-        self.__remove_nulls()
-    def __get_raw_dataset(self, *other_dataset_paths):
-        if not other_dataset_paths:
-            ticket_data_low_prio = read_csv(
-                f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv")
-            ticket_data_high_prio = read_csv(
-                f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv")
-            datasets = [ticket_data_low_prio, ticket_data_high_prio]
-        else:
-            datasets = [self.load_from_file(file) for file in other_dataset_paths]
-        self.corpus = concat(datasets)
-        print(self.corpus)
+    def __get_raw_dataset(self, *dataset_paths):
+        datasets = [self.load_from_file(file) for file in dataset_paths]
+        if len(datasets) > 1:
+            self.corpus = concat(datasets)
+        else:
+            self.corpus = datasets[0]
+        self.__remove_nulls()
@staticmethod
-    def load_from_file(filename):
+    def load_from_file(filename: str) -> pandas.DataFrame:
filetype = filename.split('.')[1].lower()
filetypes = {'csv': read_csv,
'xlsx': read_excel,
@@ -94,7 +88,12 @@ class ITSupportDatasetBuilder:
def build(self):
return self._dataset
def generate_synonyms(dataset: DataFrame, filename):
# Create an instance of the SynonymAug class
aug = naw.SynonymAug(aug_src='wordnet', verbose=True)
copied_dataset = dataset.copy()
copied_dataset['Description'] = copied_dataset['Description'].apply(lambda doc: aug.augment(doc)[0])
copied_dataset.to_csv(filename)
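# e.g. aug.augment("the printer is broken") might come back as "the printer is busted"
# (WordNet synonym swap); writing the augmented copy to CSV roughly doubles the ticket
# corpus with paraphrased descriptions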
'''@dataclass
class ITSupportDataset:
"""Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""
@@ -204,8 +203,18 @@ if __name__ == '__main__':
'\\Datasets\\ITSupport_Tickets_High_Prio.csv')
corpus = concat([ticket_data_low_prio, ticket_data_high_prio])
corpus.to_pickle('corpus.pickle')'''
-    dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
+    '''dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
print(dataset.corpus.shape)
dataset.corpus = dataset.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
print(dataset.corpus.shape)
-    print(dataset.corpus.loc[1])
+    print(dataset.corpus.loc[1])'''
# Load Dataset
dataset = ITSupportDatasetBuilder(
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
.with_summaries_and_descriptions_combined() \
.with_overall_priority_column() \
.build().corpus
print(dataset.shape)
#generate_synonyms(dataset, 'Datasets/synonym_IT_tickets.csv')
@@ -8,15 +8,19 @@ lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
-def clean_text(text: str) -> str:
+def clean_text(text: str) -> str:
# Strip HTML & XML
text = BeautifulSoup(text, "lxml").text
text = re.sub(r'\|\|\|', r' ', text)
# Strip Hyperlinks & URLs
text = re.sub(r'http\S+', r'<URL>', text)
-    text = text.lower()
# Strip Non-word characters
text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('x', '')
+    # Convert string to lowercase
+    text = text.lower()
return text
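# Worked example of the pipeline above (a sketch):
#   clean_text("<p>WiFi is down! See http://intranet/help</p>") -> "wifi is down see url"
#   (HTML stripped, URL tokenised to <URL>, punctuation and 'x' characters removed, lowercased)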
def tokenize_text(text):
untokenized_sentences = nltk.sent_tokenize(text)
tokenized_sentences = [tokenize_sentence(sentence) for sentence in untokenized_sentences]
......