benjamin.clough / COM6001M Computer Science Major Project / Commits / 1d16a38d

Commit 1d16a38d, authored May 06, 2023 by Benjamin
Parent: 4c72cb41
Commit message: almost done

Showing 15 changed files with 468 additions and 122 deletions (+468 −122)
Changed files:

  DISS DIAG.drawio.png                                                          +0   -0
  ITSupportTicketPrioritisers/Doc2Vec to LogReg (Have to train).py              +60  -0
  ITSupportTicketPrioritisers/TFIDF to CNN (Accuracy showcase).py               +11  -8
  ITSupportTicketPrioritisers/TFIDF to CNN (CSV to CSV).py                      +2   -2
  ITSupportTicketPrioritisers/TFIDF to CNN (Question Prompt).py                 +2   -2
  ITSupportTicketPrioritisers/TFIDF to LogReg.py                                +67  -0
  custom_models/classifiers/ML_classifiers.py                                   +27  -5
  custom_models/feature_selection_extraction/ML_DL_feature_extraction_selection.py        +50  -64
  custom_models/feature_selection_extraction/algorithmic_feature_extraction_selection.py  +30  -14
  dump_scripts/new_Word2Vec_train.py                                            +56  -0
  main.py                                                                       +93  -1
  project_utilities/ModelTemplates.py                                           +24  -5
  project_utilities/model_interaction.py                                        +12  -0
  project_utilities/my_datasets.py                                              +27  -18
  project_utilities/preprocessing_functionality.py                              +7   -3
File: DISS DIAG.drawio.png (new file, 0 → 100644, 17.4 KB — binary image, not shown)
File: ITSupportTicketPrioritisers/Doc2Vec to LogReg (Have to train).py (new file, 0 → 100644)
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import custom_models.classifiers.ML_classifiers
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from custom_models.feature_selection_extraction.ML_DL_feature_extraction_selection import ITSupportDoc2VecImplementation, Doc2VecModels
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction, GensimWordEmbeddingModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd

# Load Dataset
dataset = ITSupportDatasetBuilder(
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
    .with_summaries_and_descriptions_combined() \
    .with_overall_priority_column() \
    .with_pre_processed_descriptions() \
    .build().corpus

# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(
    0.1, dataset['Description'].tolist(), dataset['Priority'].tolist())

# Get pre-configured doc2vec model
doc2vec_model = ITSupportDoc2VecImplementation(Doc2VecModels.DBOW)
'''doc2vec_model.from_file(
    f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_model.model",
    GensimWordEmbeddingModelFileInteraction())'''

tagged_training_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_train_str, 'Priority': y_train}))
tagged_testing_documents = doc2vec_model.tag_documents(pd.DataFrame({'Description': X_test_str, 'Priority': y_test}))
doc2vec_model.build_vocabulary(tagged_training_documents)
doc2vec_model.train_model(tagged_training_documents, dataset_shuffles=10, epochs=10)
#doc2vec_model.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
#tagged_descriptions = doc2vec_model.tag_documents(X_test_str)
X_train = doc2vec_model.vectorize_documents(X_train_str)
X_test = doc2vec_model.vectorize_documents(X_test_str)

# Load Logistic Regression model
logreg_model = custom_models.classifiers.ML_classifiers.ITMultinomialLogisticRegression(cores_allocated=1)
'''logreg_model.use_preconfigured_model(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/doc2vec_to_logreg_model.joblib',
    SKLearnModelFileInteraction())'''
logreg_model.train_model(vectors=X_train, labels=y_train)
#logreg_model.save_model('doc2vec_to_logreg_model.joblib', SKLearnModelFileInteraction())

# Make predictions
predictions = logreg_model.make_predictions(X_test)

# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
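For orientation, here is a minimal, self-contained sketch of the same DBOW-Doc2Vec-to-logistic-regression pipeline written against gensim and scikit-learn directly. The data is a stand-in and this is not project code; the project wrappers above add file persistence, shuffling, and evaluation on top of these underlying calls:

# Hypothetical stand-alone sketch of the Doc2Vec -> LogReg pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

train_texts = ["wifi is down campus wide", "request for a new mouse"]   # stand-in tickets
train_labels = ["P1", "P5"]

# tag_documents: one TaggedDocument per ticket, tagged with its priority
tagged = [TaggedDocument(words=text.split(), tags=[label])
          for text, label in zip(train_texts, train_labels)]

# build_vocabulary + train_model: DBOW (dm=0), trained over the tagged corpus
model = Doc2Vec(dm=0, vector_size=100, min_count=1, workers=4)
model.build_vocab(tagged)
model.train(tagged, total_examples=len(tagged), epochs=10)

# vectorize_documents: infer a vector per ticket, then fit the classifier
X_train = [model.infer_vector(text.split()) for text in train_texts]
classifier = LogisticRegression(multi_class='multinomial', solver='newton-cg')
classifier.fit(X_train, train_labels)
print(classifier.predict([model.infer_vector("the network is broken".split())]))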
File: ITSupportTicketPrioritisers/DefaultDatasets_TFIDF_KerasCNN_ToCSV.py → ITSupportTicketPrioritisers/TFIDF to CNN (Accuracy showcase).py (renamed, modified)
@@ -11,22 +11,25 @@ from projectsettings import DefaultConfig
 import pandas as pd

 # Load Dataset
-dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
-                                  f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
+dataset = ITSupportDatasetBuilder(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+                                  f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
+                                  f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
     .with_summaries_and_descriptions_combined() \
     .with_overall_priority_column() \
     .with_pre_processed_descriptions() \
     .build().corpus

 # Load Pre-configured TF-IDF
 TFIDF_model = TFIDF_Model()
-TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
                       SKLearnModelFileInteraction())

 # Load Pre-configured Keras CNN
 CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
                     KerasModelFileInteraction())

 # Split dataset into test and train

@@ -47,16 +50,16 @@ encoded_predictions = CNN_model.make_predictions(X_test)
 decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))

 # Represent accuracies
-'''
 confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
 confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
 apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
 apc.plot_confusion_matrix()
-'''

 # export predictions to file
-dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
+'''dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
 formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
 prediction_saver = predictionformats.ITSupportPredictionFormat()
 prediction_saver.load_predictions(formatted_predictions)
 filename = input("Enter filename: ")
-prediction_saver.save_predictions_to_file(filename, 'csv')
+prediction_saver.save_predictions_to_file(filename, 'csv')'''
\ No newline at end of file
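DetailedConfusionMatrix and AccuracyPerClass are project evaluators; for reference, the per-class numbers they plot can be derived from a plain scikit-learn confusion matrix along these lines (a sketch under that assumption, not the project implementation):

# Illustrative per-class accuracy from a confusion matrix
import numpy as np
from sklearn.metrics import confusion_matrix

labels = ['P5', 'P4', 'P3', 'P2', 'P1']
y_true = ['P1', 'P2', 'P3', 'P4', 'P5', 'P1']          # stand-in data
y_pred = ['P1', 'P2', 'P3', 'P3', 'P5', 'P3']

cm = confusion_matrix(y_true, y_pred, labels=labels)   # rows = actual, cols = predicted
per_class_accuracy = cm.diagonal() / cm.sum(axis=1)    # correct / actual, per priority
print(cm)
print(dict(zip(labels, np.round(per_class_accuracy, 2))))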
File: ITSupportTicketPrioritisers/FromCSV_TFIDF_KerasCNN_ToCSV.py → ITSupportTicketPrioritisers/TFIDF to CNN (CSV to CSV).py (renamed, modified)
@@ -21,11 +21,11 @@ dataset = ITSupportDatasetBuilder(dataset_file_loc) \
 TFIDF_model = TFIDF_Model()
-TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
                       SKLearnModelFileInteraction())

 CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
                     KerasModelFileInteraction())

 #X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(0.1, dataset['Description'].tolist(),
File: ITSupportTicketPrioritisers/NoDataset_TFIDF_KerasCNN.py → ITSupportTicketPrioritisers/TFIDF to CNN (Question Prompt).py (renamed, modified)
@@ -9,12 +9,12 @@ import sys
 # Load Pre-configured TF-IDF
 TFIDF_model = TFIDF_Model()
-TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
+TFIDF_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
                       SKLearnModelFileInteraction())

 # Load Pre-configured Keras CNN
 CNN_model = KerasCNN()
-CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model.h5',
+CNN_model.from_file(f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/CNN_model_deeper.h5',
                     KerasModelFileInteraction())

 # Convert P1-5 into categories the model understands
File: ITSupportTicketPrioritisers/TFIDF to LogReg.py (new file, 0 → 100644)
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.classifiers.ML_classifiers import ITMultinomialLogisticRegression
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
import pandas as pd

# Load Dataset
dataset = ITSupportDatasetBuilder(
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
    .with_summaries_and_descriptions_combined() \
    .with_overall_priority_column() \
    .with_pre_processed_descriptions() \
    .build().corpus

# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib',
    SKLearnModelFileInteraction())

# Load Logistic Regression model
logreg_model = ITMultinomialLogisticRegression(cores_allocated=1)
logreg_model.use_preconfigured_model(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_to_logreg_model_larger.joblib',
    SKLearnModelFileInteraction())

# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(
    0.1, dataset['Description'].tolist(), dataset['Priority'].tolist())

# Convert the Descriptions to Sparse Matrices, representative of text
X_test = TFIDF_model.vectorize_descriptions(X_test_str)
#X_train = TFIDF_model.vectorize_descriptions(X_train_str)
#logreg_model.train_model(X_train, y_train)
# vectorized_desc = TFIDF_Model.vectorize_description(self=TFIDF_model, description="WIFI network has lost connction across the whole campus, this needs fixing ASAP")
#encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
'''y_train = to_categorical(encoder.transform(y_train))
y_val = to_categorical(encoder.transform(y_test))'''

# Make predictions
predictions = logreg_model.make_predictions(X_test)
#decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))

# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()

# export predictions to file
'''dict_descriptions_predictions = {'Description': X_test_str, 'PredictedPriority': decoded_predictions}
formatted_predictions = pd.DataFrame(dict_descriptions_predictions)
prediction_saver = predictionformats.ITSupportPredictionFormat()
prediction_saver.load_predictions(formatted_predictions)
filename = input("Enter filename: ")
prediction_saver.save_predictions_to_file(filename, 'csv')'''
File: custom_models/classifiers/ML_classifiers.py (modified)
@@ -3,15 +3,18 @@ from sklearn.naive_bayes import MultinomialNB
 from sklearn.svm import LinearSVC
 from sklearn.ensemble import RandomForestClassifier
 from project_utilities.ModelTemplates import SKLearnMachineLearningModel
+from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
+from projectsettings import DefaultConfig
+from project_utilities.model_interaction import SKLearnModelFileInteraction
 from project_utilities import my_datasets


 class ITMultinomialLogisticRegression(SKLearnMachineLearningModel):
     def __init__(self, inverse_regularisation_strength: float = 1e5, cores_allocated: int = 1):
         super().__init__(LogisticRegression(n_jobs=cores_allocated,
                                             C=inverse_regularisation_strength,
                                             multi_class='multinomial',
                                             solver='newton-cg',
-                                            verbose=1))
+                                            verbose=1,
+                                            max_iter=10000))


 class ITMultinomialNaiveBayes(SKLearnMachineLearningModel):

@@ -30,5 +33,24 @@ class ITRandomForestClassifier(SKLearnMachineLearningModel):
 if __name__ == "__main__":
     # logreg = ITMultinomialLogisticRegression(6, 1e5)
-    pass
+    # Get Dataset
+    dataset = my_datasets.ITSupportDatasetBuilder(
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
+        .with_summaries_and_descriptions_combined() \
+        .with_overall_priority_column() \
+        .with_pre_processed_descriptions() \
+        .build().corpus
+
+    logreg = ITMultinomialLogisticRegression(1e5, 6)
+    tfidf = TFIDF_Model()
+    tfidf.from_file(f"{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_larger_model.joblib",
+                    SKLearnModelFileInteraction())
+    X = dataset['Description'].tolist()
+    y = dataset['Priority'].tolist()
+    X_train_str, X_test_str, y_train, y_test = tfidf.split_dataset(0.1, X, y)
+    X_train = tfidf.vectorize_descriptions(X_train_str)
+    X_test = tfidf.vectorize_descriptions(X_test_str)
+    logreg.train_model(X_train, y_train)
+    logreg.save_model('tfidf_to_logreg_model.joblib', SKLearnModelFileInteraction())
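Note that the positional call ITMultinomialLogisticRegression(1e5, 6) maps to inverse_regularisation_strength=1e5 and cores_allocated=6. As a hedged usage sketch, the two artifacts this __main__ block writes can be consumed with plain joblib, assuming (as the file-interaction classes suggest) that they are joblib dumps of the fitted vectorizer and the fitted scikit-learn classifier:

# Hedged sketch: reloading the saved vectorizer and classifier
import joblib

tfidf_vectorizer = joblib.load('tfidf_larger_model.joblib')     # fitted TfidfVectorizer
logreg = joblib.load('tfidf_to_logreg_model.joblib')            # fitted LogisticRegression
X = tfidf_vectorizer.transform(["outlook will not start"]).toarray()
print(logreg.predict(X))                                        # e.g. ['P3']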
File: custom_models/feature_selection_extraction/ML_DL_feature_extraction_selection.py (modified)
@@ -9,121 +9,107 @@ from pandas import DataFrame
 from sklearn import utils
 from sklearn.model_selection import train_test_split
 from tqdm import tqdm
+from project_utilities import model_interaction
+from project_utilities.ModelTemplates import GensimWordEmbeddingModel
+from gensim.models.doc2vec import Doc2Vec
 # from project_utilities import preprocessing_functionality, my_datasets
 from project_utilities import my_datasets
 import preprocessing_functionality


 class Doc2VecModels(Enum):
     DBOW = 1
     DM = 2
     COMBINED = 3


-class ITSupportDoc2VecImplementation:
-    dataset = DataFrame
-    tagged_training_documents = DataFrame
-    tagged_testing_documents = DataFrame
+class ITSupportDoc2VecImplementation(GensimWordEmbeddingModel):
     model_type = Doc2VecModels
-    model = gensim.models.Doc2Vec
-    train_descriptions = \
-        test_descriptions = \
-        train_labels = \
-        test_labels = tuple

-    def __init__(self, dataset, model_type):
-        self.dataset = dataset
+    def __init__(self, model_type, alpha_change=-.002):
         self.model_type = model_type
-        self.alpha_change = None
+        model = self.create_model()
+        self.alpha_change = alpha_change
         tqdm.pandas(desc="progress-bar")
+        super().__init__(model)

-    def split_texts(self):
-        training_data, testing_data = train_test_split(self.dataset, test_size=0.1, random_state=1000)
+    @staticmethod
+    def split_texts(dataset):
+        training_data, testing_data = train_test_split(dataset, test_size=0.1, random_state=1000)
         return training_data, testing_data

-    def tag_documents(self):
-        training_documents, testing_documents = self.split_texts()
-        self.tagged_training_documents = training_documents.apply(
-            lambda docs: gensim.models.doc2vec.TaggedDocument(
-                words=preprocessing_functionality.tokenize_text(docs.Description), tags=[docs.Priority]),
-            axis=1)
-        self.tagged_testing_documents = testing_documents.apply(
+    def tag_documents(self, documents) -> DataFrame:
+        tagged_documents = documents.apply(
             lambda docs: gensim.models.doc2vec.TaggedDocument(
                 words=preprocessing_functionality.tokenize_text(docs.Description), tags=[docs.Priority]),
             axis=1)
+        return tagged_documents

     def create_model(self):
         cores = multiprocessing.cpu_count()
         match self.model_type:
             case Doc2VecModels.DBOW:
-                self._create_dbow_model(cores)
+                return self._create_dbow_model(cores)
             case Doc2VecModels.DM:
-                self._create_dm_model(cores)
-            case Doc2VecModels.COMBINED:
-                self._create_combined_model(cores)
+                return self._create_dm_model(cores)
             case _:
                 raise TypeError("Must be a Doc2Vec model type (DBOW, DM, COMBINED)")

     def _create_dbow_model(self, cores):
-        self.model = gensim.models.Doc2Vec(dm=0, vector_size=1000, negative=5, hs=0, min_count=2, sample=0, workers=cores)
-        self.alpha_change = 0.0002
+        model = Doc2Vec(dm=0, vector_size=1000, negative=5, hs=0, min_count=2, sample=0, workers=cores)
+        return model

     def _create_dm_model(self, cores):
-        self.model = gensim.models.Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
-                                           min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
-        self.alpha_change = -0.002
+        model = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
+                        min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
+        return model

     def _create_combined_model(self, cores):
         dbow_model = gensim.models.Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
         dm_model = gensim.models.Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5,
                                          min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
         self.model = ConcatenatedDoc2Vec([dbow_model, dm_model])

-    def build_vocabulary(self):
-        vocabulary = [x for x in tqdm(self.tagged_training_documents.values)]
+    def build_vocabulary(self, tagged_training_documents):
+        vocabulary = [x for x in tqdm(tagged_training_documents.values)]
         self.model.build_vocab(vocabulary)

-    def train_model(self, dataset_shuffles: int = 1, epochs: int = 1):
+    def train_model(self, tagged_training_documents, dataset_shuffles: int = 1, epochs: int = 1):
         for training_round in range(dataset_shuffles):
-            shuffled_training_data = utils.shuffle([x for x in tqdm(self.tagged_training_documents.values)])
-            datapoint_quantity = len(self.tagged_training_documents)
-            self.model.train(shuffled_training_data, total_examples=datapoint_quantity, epochs=epochs)
+            # shuffle training data
+            shuffled_training_data = utils.shuffle([x for x in tqdm(tagged_training_documents.values)])
+            dataset_size = len(tagged_training_documents)
+            self.model.train(shuffled_training_data, total_examples=dataset_size, epochs=epochs)
             self.model.alpha += self.alpha_change
             self.model.min_alpha = self.model.alpha

     # @numba.jit(forceobj=True)
     def vectorize_tagged_documents(self, tagged_documents):
         sentences = tagged_documents.values
         targets, regressors = zip(*[(doc.tags[0], self.model.infer_vector(doc.words)) for doc in sentences])
         return targets, regressors

-    def generate_vectors(self):
-        self.train_labels, self.train_descriptions = self.vectorize_tagged_documents(self.tagged_training_documents)
-        self.test_labels, self.test_descriptions = self.vectorize_tagged_documents(self.tagged_testing_documents)
+    def generate_training_vectors(self, tagged_documents):
+        labels, descriptions = self.vectorize_tagged_documents(tagged_documents)
+        return labels, descriptions

+    def vectorize_documents(self, documents):
+        documents = [document.split(' ') for document in documents]
+        return [self.model.infer_vector(document) for document in documents]


 if __name__ == '__main__':
-    '''
-    dataset = my_datasets.ITSupportDatasetBuilder() \
-        .with_overall_priority_column() \
-        .with_summaries_and_descriptions_combined() \
-        .with_pre_processed_descriptions() \
-        .build()
-    doc2vec_IT = ITSupportDoc2VecImplementation(dataset=dataset.corpus, model_type=Doc2VecModels.DM)
-    #doc2vec_IT.pre_process_texts()
-    doc2vec_IT.tag_documents()
-    doc2vec_IT.create_model()
-    t1 = time.perf_counter()
-    doc2vec_IT.build_vocabulary()
-    doc2vec_IT.train_model(dataset_shuffles=1, epochs=1)
-    print("time: " + str(time.perf_counter() - t1))
-    doc2vec_IT.generate_vectors()
-    print(doc2vec_IT.tagged_training_documents[50])
-    #print(doc2vec_IT.X_test)'''
+    dataset = my_datasets.ITSupportDatasetBuilder() \
+        .with_overall_priority_column() \
+        .with_summaries_and_descriptions_combined() \
+        .with_pre_processed_descriptions() \
+        .build().corpus
+    doc2vec_IT = ITSupportDoc2VecImplementation(model_type=Doc2VecModels.DBOW, alpha_change=-0.002)
+    training_documents, testing_documents = doc2vec_IT.split_texts(dataset)
+    tagged_training_documents = doc2vec_IT.tag_documents(training_documents)
+    tagged_testing_documents = doc2vec_IT.tag_documents(testing_documents)
+    doc2vec_IT.build_vocabulary(tagged_training_documents)
+    doc2vec_IT.train_model(tagged_training_documents, dataset_shuffles=3, epochs=10)
+    doc2vec_IT.to_file("doc2vec_model.model", model_interaction.GensimWordEmbeddingModelFileInteraction())
+    #doc2vec_IT.generate_vectors()
+    #print(doc2vec_IT.X_test)
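The refactored train_model implements a manual learning-rate schedule: after each shuffled pass, model.alpha is decremented by alpha_change and min_alpha is pinned to it. In isolation, with plain gensim, that loop looks like this (illustrative values and data, not project code):

# Sketch of the manual alpha-decay schedule used by train_model
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils

docs = [TaggedDocument(words=["printer", "jam"], tags=["P4"]),
        TaggedDocument(words=["server", "down"], tags=["P1"])]
model = Doc2Vec(dm=0, vector_size=50, min_count=1, alpha=0.065, min_alpha=0.065)
model.build_vocab(docs)

alpha_change = -0.002
for _ in range(3):                                    # dataset_shuffles
    shuffled = utils.shuffle(list(docs))
    model.train(shuffled, total_examples=len(docs), epochs=10)
    model.alpha += alpha_change                       # decay the learning rate each round
    model.min_alpha = model.alpha                     # hold it constant within the next round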
File: custom_models/feature_selection_extraction/algorithmic_feature_extraction_selection.py (modified)
 import numpy
 from tqdm import tqdm
+import projectsettings
 from project_utilities import my_datasets, preprocessing_functionality
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split

@@ -8,9 +10,10 @@ from sklearn.metrics import classification_report
 from project_utilities import evaluators
 import pandas
 import numba
-from custom_models.classifiers import ML_classifiers
+# from custom_models.classifiers import ML_classifiers
 import joblib
 from project_utilities import model_interaction
+from projectsettings import DefaultConfig


 @numba.jit(forceobj=1)

@@ -26,18 +29,23 @@ class TFIDF_Model:
     def __init__(self):
         self.vectorizer = TfidfVectorizer(max_features=10000)

+    # Train model
     def fit_to_corpus(self, texts):
         self.vectorizer.fit(texts)

-    def from_file(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Load TFIDF from file
+    def from_file(self, filename, model_loader: model_interaction.FileInteraction):
         self.vectorizer = model_loader.load_from_file(filename)

+    # Save TFIDF to file
     def to_file(self, filename):
         joblib.dump(self.vectorizer, filename)

+    # Get vector for a single text
     def vectorize_description(self, description):
         return self.vectorizer.transform([description]).toarray()

+    # Get vectors for multiple texts
     def vectorize_descriptions(self, descriptions):
         return self.vectorizer.transform(descriptions).toarray()

@@ -61,15 +69,15 @@ class TFIDF_Model:
     X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, vectorised_descriptions, dataset['Priority'].tolist())
     logreg = ML_classifiers.ITMultinomialLogisticRegression()
-    logreg.use_preconfigured_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
+    # logreg.use_preconfigured_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
     print('Training Model')
     logreg.train_model()
-    joblib.dump(logreg, "logreg_model.joblib")
+    joblib.dump(logreg, "tfidf_to_logreg_model.joblib")
     print("finished!")
     # print(X_train, X_test)
     # logreg.train_model(X_train, y_train)
-    # logreg.save_model('logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
+    # logreg.save_model('tfidf_to_logreg_model.joblib', model_interaction.SKLearnModelFileInteraction())
     label_predictions = logreg.make_predictions(X_test)
     # print('Made Predictions') #classification_report(tfidf.testing_labels, label_predictions))

@@ -95,16 +103,24 @@ if __name__ == '__main__':
     tfidf.fit_to_corpus(dataset['Description'].tolist())
     tfidf.to_file('tfidf_model.joblib')'''

     # Get Dataset
-    '''dataset = my_datasets.ITSupportDatasetBuilder() \
+    dataset = my_datasets.ITSupportDatasetBuilder(
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
         .with_summaries_and_descriptions_combined() \
         .with_overall_priority_column() \
         .with_pre_processed_descriptions() \
-        .build().corpus
-    tfidf = ITSupportTFIDFImplementation(dataset)
-    tfidf.vectorize_descriptions()
-    logreg = joblib.load("logreg_model.joblib")
-    IT_issue = input("Enter IT issue to be prioritised: ")
-    preprocessed_input = tfidf.vectorize_description(IT_issue)
-    label_predictions = logreg.make_predictions(preprocessed_input)
-    print(label_predictions)'''
+        .build().corpus
+    tfidf = TFIDF_Model()
+    X = dataset['Description'].tolist()
+    y = dataset['Priority'].tolist()
+    X_train, X_test, y_train, y_test = tfidf.split_dataset(0.1, X, y)
+    #tfidf.vectorize_descriptions(X_train)
+    tfidf.fit_to_corpus(X_train)
+    tfidf.to_file('tfidf_larger_model.joblib')
+    #logreg = joblib.load("tfidf_to_logreg_model.joblib")
+    #IT_issue = input("Enter IT issue to be prioritised: ")
+    #preprocessed_input = tfidf.vectorize_description(IT_issue)
+    #label_predictions = logreg.make_predictions(preprocessed_input)
+    #print(label_predictions)
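TFIDF_Model is a thin wrapper over scikit-learn and joblib; the save/load round trip it performs reduces to this (a sketch with stand-in texts, not project code):

# Round trip: fit_to_corpus -> to_file -> from_file -> vectorize_descriptions
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(["wifi outage in building a", "password reset request"])
joblib.dump(vectorizer, 'tfidf_larger_model.joblib')            # to_file

restored = joblib.load('tfidf_larger_model.joblib')             # from_file
dense = restored.transform(["wifi password reset"]).toarray()   # vectorize_descriptions
print(dense.shape)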
File: dump_scripts/new_Word2Vec_train.py (new file, 0 → 100644)
from gensim.models import Word2Vec
from project_utilities import my_datasets
from projectsettings import DefaultConfig
import numpy as np
from custom_models.classifiers.ML_classifiers import ITMultinomialLogisticRegression
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass

# Load Dataset
dataset = my_datasets.ITSupportDatasetBuilder(
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/synonym_IT_tickets.csv") \
    .with_summaries_and_descriptions_combined() \
    .with_overall_priority_column() \
    .with_pre_processed_descriptions() \
    .build().corpus

dataset['Description'] = dataset['Description'].apply(lambda x: x.split(' '))

# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_Model.split_dataset(
    0.1, dataset['Description'].tolist(), dataset['Priority'].tolist())

# Create and train the Word2Vec model
model = Word2Vec(sentences=X_train_str, vector_size=250, window=5, min_count=3, workers=16)

# Save the model
#model.save("word2vec.model")


def get_vectors(texts):
    X_vectors = []
    for sentence in texts:
        sentence_vectors = []
        for word in sentence:
            if word in model.wv:
                sentence_vectors.append(model.wv[word])
            else:
                # Handle words not in the vocabulary
                sentence_vectors.append(np.zeros(model.vector_size))
        X_vectors.append(np.mean(sentence_vectors, axis=0))
    X_vectors = np.array(X_vectors)
    return X_vectors


X_vectors = get_vectors(X_train_str)
X_test_vectors = get_vectors(X_test_str)

logreg = ITMultinomialLogisticRegression()
logreg.train_model(X_vectors, y_train)
pred = logreg.make_predictions(X_test_vectors)

# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(pred, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(pred, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()
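get_vectors mean-pools word vectors per ticket and zero-fills out-of-vocabulary words. The same logic in a compact NumPy form, for comparison (like the original, it assumes each token list is non-empty):

# Compact alternative to get_vectors (sketch, not project code)
import numpy as np

def get_vectors_compact(texts, model):
    zero = np.zeros(model.vector_size)                       # OOV placeholder
    return np.array([
        np.mean([model.wv[w] if w in model.wv else zero for w in sentence], axis=0)
        for sentence in texts
    ])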
File: main.py (modified)
-import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
+# import ITSupportTicketPrioritisers.FromCSV_TFIDF_KerasCNN_ToCSV
 #import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_KerasCNN_ToCSV
 #import ITSupportTicketPrioritisers.NoDataset_TFIDF_KerasCNN
 #import ITSupportTicketPrioritisers.DefaultDatasets_TFIDF_SKLearnLogReg_ToCSV

The remainder of the file is newly added in this commit:

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from custom_models.classifiers.DL_classifiers import KerasCNN
from custom_models.feature_selection_extraction.algorithmic_feature_extraction_selection import TFIDF_Model
from project_utilities.evaluators import DetailedConfusionMatrix, AccuracyPerClass
from project_utilities.model_interaction import SKLearnModelFileInteraction, KerasModelFileInteraction
from project_utilities import predictionformats
from project_utilities.my_datasets import ITSupportDatasetBuilder
from projectsettings import DefaultConfig
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D, Dropout, LSTM
from project_utilities import my_datasets
from keras.callbacks import EarlyStopping
from project_utilities import evaluators
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2

# Load Dataset
dataset = ITSupportDatasetBuilder(
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
    f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
    .with_summaries_and_descriptions_combined() \
    .with_overall_priority_column() \
    .with_pre_processed_descriptions() \
    .build().corpus

# Load Pre-configured TF-IDF
TFIDF_model = TFIDF_Model()
TFIDF_model.from_file(
    f'{DefaultConfig.absolute_project_root_path()}/custom_models/preconfigured_models/tfidf_model.joblib',
    SKLearnModelFileInteraction())

# Split dataset into test and train
X_train_str, X_test_str, y_train, y_test = TFIDF_model.split_dataset(
    0.1, dataset['Description'].tolist(), dataset['Priority'].tolist())
X_train_tfidf = TFIDF_model.vectorize_descriptions(X_train_str)
X_test_tfidf = TFIDF_model.vectorize_descriptions(X_test_str)

# Encode class labels
encoder = LabelEncoder()
encoder.fit(['P5', 'P4', 'P3', 'P2', 'P1'])
y_train = encoder.transform(y_train)
y_val = encoder.transform(y_test)
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

input_dim = X_train_tfidf.shape[1]

model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.60))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.40))
model.add(Dense(5, activation='softmax'))

# Compile model
opt = Adam(lr=0.001)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)

# Train model
from keras.callbacks import EarlyStopping

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

model.fit(X_train_tfidf, y_train, epochs=50, batch_size=50,
          validation_data=(X_test_tfidf, y_val), callbacks=[early_stopping])
model.save('CNN_model_larger_regularised.h5')
print("finished")

'''# Make predictions
encoded_predictions = CNN_model.make_predictions(X_test)
decoded_predictions = encoder.inverse_transform(encoded_predictions.argmax(axis=1))

# Represent accuracies
confusion_matrix = DetailedConfusionMatrix(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
confusion_matrix.plot_confusion_matrix(fullscreen_requested=True)
apc = AccuracyPerClass(decoded_predictions, y_test, ['P5', 'P4', 'P3', 'P2', 'P1'])
apc.plot_confusion_matrix()'''
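Two small observations on this script: Adam(lr=0.001) uses the legacy Keras argument name (newer releases spell it learning_rate), and the commented prediction block decodes softmax rows back to P1-P5 labels. On stand-in arrays, that decode step works like this (LabelEncoder orders classes alphabetically, so index 0 corresponds to 'P1'):

# Decoding categorical predictions back to priority labels (illustrative data)
import numpy as np
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder().fit(['P5', 'P4', 'P3', 'P2', 'P1'])
softmax_outputs = np.array([[0.1, 0.2, 0.4, 0.2, 0.1],      # one row per ticket
                            [0.7, 0.1, 0.1, 0.05, 0.05]])
decoded = encoder.inverse_transform(softmax_outputs.argmax(axis=1))
print(decoded)   # ['P3' 'P1'] -- argmax indices follow the encoder's sorted class order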
File: project_utilities/ModelTemplates.py (modified)
 from abc import ABC, abstractmethod
 from project_utilities import model_interaction
 from keras.models import load_model
+from gensim.models.doc2vec import Doc2Vec
+from gensim.models.word2vec import Word2Vec


 class SKLearnMachineLearningModel(ABC):

@@ -10,15 +11,19 @@ class SKLearnMachineLearningModel(ABC):
     def __init__(self, model=None):
         self.model = model

-    def use_preconfigured_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Load model from file
+    def use_preconfigured_model(self, filename, model_loader: model_interaction.FileInteraction):
         self.model = model_loader.load_from_file(filename)

-    def save_model(self, filename, model_loader: model_interaction.SKLearnModelFileInteraction):
+    # Load model to file
+    def save_model(self, filename, model_loader: model_interaction.FileInteraction):
         model_loader.load_to_file(self.model, filename)

+    # Train model
     def train_model(self, vectors, labels):
         self.model.fit(vectors, labels)

+    # Given items, predict priority
     def make_predictions(self, items):
         return self.model.predict(items)

@@ -29,10 +34,10 @@ class KerasDeepLearningModel(ABC):
     def __init__(self, model=None):
         self.model = model

-    def from_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
+    def from_file(self, filename, model_loader: model_interaction.FileInteraction):
         self.model = model_loader.load_from_file(filename)

-    def to_file(self, filename, model_loader: model_interaction.KerasModelFileInteraction):
+    def to_file(self, filename, model_loader: model_interaction.FileInteraction):
         model_loader.load_to_file(self.model, filename)

     def add_model_config(self, layer):

@@ -48,3 +53,17 @@ class KerasDeepLearningModel(ABC):
     @abstractmethod
     def make_predictions(self, vectors):
         pass
+
+
+class GensimWordEmbeddingModel(ABC):
+    model: Doc2Vec or Word2Vec
+
+    def __init__(self, model=None):
+        self.model = model
+
+    def from_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
+        self.model = model_loader.load_from_file(filename)
+        print(self.model)
+
+    def to_file(self, filename, model_loader: model_interaction.GensimWordEmbeddingModelFileInteraction):
+        model_loader.load_to_file(self.model, filename)
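Widening the loader hints from SKLearnModelFileInteraction/KerasModelFileInteraction to the FileInteraction base means any object exposing the same two static methods can be passed in. A hypothetical extra loader (not in the project) showing that extension point:

# Hypothetical loader demonstrating the widened FileInteraction contract
import joblib
from project_utilities.model_interaction import FileInteraction

class CompressedSKLearnFileInteraction(FileInteraction):
    @staticmethod
    def load_from_file(filename):
        return joblib.load(filename)

    @staticmethod
    def load_to_file(model, filename):
        joblib.dump(model, filename, compress=3)    # joblib supports compression levels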
File: project_utilities/model_interaction.py (modified)
 from abc import ABC, abstractmethod
 import joblib
 from keras.models import load_model, save_model
+from gensim.models.doc2vec import Doc2Vec


 class FileInteraction(ABC):

@@ -34,3 +35,14 @@ class KerasModelFileInteraction(FileInteraction):
     @staticmethod
     def load_to_file(model, filename):
         save_model(model, filename)
+
+
+class GensimWordEmbeddingModelFileInteraction(FileInteraction):
+    @staticmethod
+    def load_from_file(filename):
+        return Doc2Vec.load(filename)
+
+    @staticmethod
+    def load_to_file(model, filename):
+        model.save(filename)
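The new class wraps two direct gensim calls; for reference, the equivalent round trip on a stand-in untrained model:

# What GensimWordEmbeddingModelFileInteraction delegates to (sketch)
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(vector_size=50, min_count=1)      # stand-in, untrained model
model.save("doc2vec_model.model")                 # load_to_file
restored = Doc2Vec.load("doc2vec_model.model")    # load_from_file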
File: project_utilities/my_datasets.py (modified)
+import pandas
 from pandas import read_csv, DataFrame, concat, read_json, read_excel
 from dataclasses import dataclass
 import preprocessing_functionality
 from projectsettings import DefaultConfig
+import nlpaug.augmenter.word as naw
 from bs4 import BeautifulSoup


 @dataclass
 class ITSupportDatasetWithBuilder:

@@ -13,23 +15,15 @@ class ITSupportDatasetWithBuilder:
     corpus = DataFrame

     def __init__(self, *dataset_paths):
-        self.__get_raw_dataset(*dataset_paths)
-        self.__remove_nulls()
-
-    def __get_raw_dataset(self, *other_dataset_paths):
-        if not other_dataset_paths:
-            ticket_data_low_prio = read_csv(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv")
-            ticket_data_high_prio = read_csv(f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv")
-            datasets = [ticket_data_low_prio, ticket_data_high_prio]
-        else:
-            datasets = [self.load_from_file(file) for file in other_dataset_paths]
-        self.corpus = concat(datasets)
-        print(self.corpus)
+        datasets = [self.load_from_file(file) for file in dataset_paths]
+        if len(datasets) > 1:
+            self.corpus = concat(datasets)
+        else:
+            self.corpus = datasets[0]
+        self.__remove_nulls()

     @staticmethod
-    def load_from_file(filename):
+    def load_from_file(filename: str) -> pandas.DataFrame:
         filetype = filename.split('.')[1].lower()
         filetypes = {'csv': read_csv,
                      'xlsx': read_excel,

@@ -94,7 +88,12 @@ class ITSupportDatasetBuilder:
     def build(self):
         return self._dataset


+def generate_synonyms(dataset: DataFrame, filename):
+    # Create an instance of the SynonymAug class
+    aug = naw.SynonymAug(aug_src='wordnet', verbose=True)
+    copied_dataset = dataset.copy()
+    copied_dataset['Description'] = copied_dataset['Description'].apply(lambda doc: aug.augment(doc)[0])
+    copied_dataset.to_csv(filename)
+
+
 '''@dataclass
 class ITSupportDataset:
     """Class for storing the IT Support Ticket Descriptions, Impacts, Urgencies, and Overall Priority"""

@@ -204,8 +203,18 @@ if __name__ == '__main__':
     '\\Datasets\\ITSupport_Tickets_High_Prio.csv')
     corpus = concat([ticket_data_low_prio, ticket_data_high_prio])
     corpus.to_pickle('corpus.pickle')'''

-    dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
-    print(dataset.corpus.shape)
-    dataset.corpus = dataset.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
-    print(dataset.corpus.shape)
-    print(dataset.corpus.loc[1])
+    '''
+    dataset = ITSupportDatasetBuilder().with_summaries_and_descriptions_combined().with_overall_priority_column().build()
+    print(dataset.corpus.shape)
+    dataset.corpus = dataset.corpus.reset_index().drop_duplicates(subset='index', keep='first').set_index('index')
+    print(dataset.corpus.shape)
+    print(dataset.corpus.loc[1])'''
+
+    # Load Dataset
+    dataset = ITSupportDatasetBuilder(
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets.csv",
+        f"{DefaultConfig.absolute_project_root_path()}/project_utilities/Datasets/ITSupport_Tickets_High_Prio.csv") \
+        .with_summaries_and_descriptions_combined() \
+        .with_overall_priority_column() \
+        .build().corpus
+    print(dataset.shape)
+    #generate_synonyms(dataset, 'Datasets/synonym_IT_tickets.csv')
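generate_synonyms delegates to nlpaug's WordNet-based SynonymAug. In isolation it behaves like this sketch (requires the NLTK WordNet corpus; recent nlpaug versions return a list from augment, which is why the project code indexes [0]):

# Minimal SynonymAug usage sketch
import nlpaug.augmenter.word as naw

aug = naw.SynonymAug(aug_src='wordnet')
augmented = aug.augment("The printer in the library is broken")
print(augmented[0] if isinstance(augmented, list) else augmented)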
File: project_utilities/preprocessing_functionality.py (modified)
@@ -8,15 +8,19 @@ lemmatizer = nltk.stem.WordNetLemmatizer()
 stemmer = nltk.PorterStemmer()


-def clean_text(text):
+def clean_text(text: str) -> str:
+    # Strip HTML & XML
     text = BeautifulSoup(text, "lxml").text
     text = re.sub(r'\|\|\|', r' ', text)
+    # Strip Hyperlinks & URLs
     text = re.sub(r'http\S+', r'<URL>', text)
-    text = text.lower()
+    # Strip Non-word characters
     text = re.sub(r'[^\w\s]', '', text)
+    # Convert string to lowercase
     text = text.replace('x', '')
+    text = text.lower()
     return text


 def tokenize_text(text):
     untokenized_sentences = nltk.sent_tokenize(text)
     tokenized_sentences = [tokenize_sentence(sentence) for sentence in untokenized_sentences]
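A worked example of the rewritten clean_text, assuming lxml is installed. Note that replace('x', '') runs before lower(), so lowercase 'x' is stripped from ordinary words while an uppercase 'X' would survive until the final lower():

# clean_text walkthrough on a sample ticket description
print(clean_text("Please fix the WiFi: http://intranet/help"))
# -> 'please fi the wifi url'
# ('fix' loses its 'x'; the URL becomes the <URL> token, which then loses its angle brackets)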