Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
Major Project Handcrafted
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
jonathan.poalses
Major Project Handcrafted
Commits
0dcbce8b
Commit
0dcbce8b
authored
May 18, 2023
by
Jonathan Poalses
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added sample data and sample expected for ml stuff
parent
70a83fdb
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
302 deletions
+4
-302
dialect_nlp.clj
src/poalses/jonathan/dialect/dialect_nlp.clj
+3
-301
main.clj
src/poalses/jonathan/dialect/main.clj
+1
-1
No files found.
src/poalses/jonathan/dialect/dialect_nlp.clj
View file @
0dcbce8b
(
ns
poalses.jonathan.dialect.dialect-nlp
(
ns
poalses.jonathan.dialect.dialect-nlp
(
:require
[
clojure.datafy
:refer
[
datafy
]]
(
:require
[
dk.simongray.datalinguist
:as
dl
]))
[
dk.simongray.datalinguist
:as
dl
]
[
dk.simongray.datalinguist.triple
:refer
[
triple->datalog
]])
(
:import
[
edu.stanford.nlp.coref
CorefCoreAnnotations$CorefChainAnnotation
]))
(
def
nlp
(
def
nlp
(
dl/->pipeline
{
:annotators
[
"truecase"
(
dl/->pipeline
{
:annotators
[
"quote"
"entitymentions"
"parse"
"depparse"
"lemma"
"relation"
"tokenize"
"tokenize"
"coref"
"ner"
]}))
"openie"
"ner"
]
:quote
{
:extractUnclosedQuotes
"true"
}}))
;; Word sets that will show a sentence as being of that dialect
;; Word sets that will show a sentence as being of that dialect
...
@@ -64,20 +52,9 @@
...
@@ -64,20 +52,9 @@
(
cond
)
(
cond
)
;; Take a text sample and separate it into its sentences, then for each sentence find its dialects, and return the most common dialect
;; A sentence can have any number of dialects associated with it, as detect-sentence-dialect can return a collection;
;; when no dialect can be detected it defaults to standard. E.g. if there's a sample with 3 sentences, where one reads as Scottish,
;; one reads as Scottish and Australian, and the last reads as nothing, the flattened results will contain 2 Scottish keys,
;; one Australian key, and one standard key, meaning the sample would be seen as a Scottish sample.
(
defn
detect-sample-dialect
[
sample
]
(
defn
detect-sample-dialect
[
sample
]
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
sample
)))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
sample
)))))))))
;; slurp a file containing a map of all the samples and their expected dialect
;; detect each sample, collecting the results, and comparing against the expected
;; output the results into both a file, as a map of the samples and if it was successful or not,
;; and onto the command line as a fraction and percentage, with a breakdown per dialect
(
defn
detect-dialects
(
defn
detect-dialects
[
filename
]
[
filename
]
(
let
[
filedata
(
clojure.edn/read-string
(
slurp
filename
))
(
let
[
filedata
(
clojure.edn/read-string
(
slurp
filename
))
...
@@ -86,278 +63,3 @@
...
@@ -86,278 +63,3 @@
(
let
[
result-comparison
(
map
zero?
(
map
compare
key-results
expected-results
))]
(
let
[
result-comparison
(
map
zero?
(
map
compare
key-results
expected-results
))]
(
println
(
str
(
count
(
filter
true?
result-comparison
))
"/"
(
count
result-comparison
)
" correct."
)))))
(
println
(
str
(
count
(
filter
true?
result-comparison
))
"/"
(
count
result-comparison
)
" correct."
)))))
;; Export the samples file as two plain-text files for the machine-learning tooling.
;; The input is read as EDN and is expected to be a map (presumably sample text ->
;; expected dialect, as in detect-dialects). The printed form of its keys is written
;; to `data-file` and the printed form of its values to `expected-file`.
;;
;; Arities:
;;   []                                      - uses the original hard-coded names
;;                                             ("all_samples2.edn", "sample_data.txt",
;;                                             "sample_expected.txt"), so existing
;;                                             callers behave exactly as before.
;;   [samples-file data-file expected-file]  - same export for any set of paths.
;;
;; Returns whatever the final `spit` returns.
(defn generate-machine-data
  ([]
   (generate-machine-data "all_samples2.edn" "sample_data.txt" "sample_expected.txt"))
  ([samples-file data-file expected-file]
   (let [filedata (clojure.edn/read-string (slurp samples-file))
         ;; Both strings are built before anything is written, so a malformed
         ;; input file fails before either output file is touched.
         data     (str (keys filedata))
         value    (str (vals filedata))]
     (spit data-file data)
     (spit expected-file value))))
(
comment
(
def
filedata
(
clojure.edn/read-string
(
slurp
"all_samples2.edn"
)))
(
def
key-results
(
map
detect-sample-dialect
(
keys
filedata
)))
(
def
expected-results
(
map
keyword
(
vals
filedata
)))
(
def
datayaya
(
str
(
keys
filedata
)))
(
def
whyyaya
(
str
(
vals
filedata
)))
(
def
)
(
println
datayaya
)
(
println
whyyaya
)
(
spit
"sample_data.txt"
datayaya
)
(
spit
"sample_expected.txt"
whyyaya
)
(
interleave
key-results
(
map
zero?
(
map
compare
key-results
expected-results
)))
(
zipmap
(
keys
filedata
)
(
map
zero?
(
map
compare
key-results
expected-results
)))
;; Test every annotator in the pipeline
(
map
dl/true-case
@
sentences
)
(
map
dl/quotations
@
sentences
)
(
map
dl/mentions
@
sentences
)
(
map
dl/annotation
"relation"
@
sentences
)
(
map
dl/constituency-tree
@
sentences
)
(
map
dl/constituency-tree
sentences-one
)
(
map
dl/constituency-tree
@
sentences-two
)
(
map
dl/dependency-graph
@
sentences
)
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
)))
(
datafy
(
dl/dependency-graph
(
nth
@
sentences-two
1
)))
(
:leafVertices
(
bean
(
dl/dependency-graph
(
nth
@
sentences-two
1
))))
(
map
dl/dependency-graph
@
sentences-two
)
(
bean
(
dl/dependency-graph
@
sentences-two
))
(
map
dl/lemma
@
sentences
)
(
map
dl/lemma
sentences-one
)
(
map
dl/lemma
@
sentences-two
)
(
dl/text
(
dl/tokens
(
nth
sentences-one
1
)))
(
map
dl/tokens
@
sentences-two
)
(
->>
(
mapcat
dl/triples
@
sentences
)
(
map
triple->datalog
))
(
dl/annotation
CorefCoreAnnotations$CorefChainAnnotation
@
annotated-example
)
(
show-dependencies
)
(
def
rats
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-one
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-two
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-three
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-four
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-five
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-six
))))))))
(
first
(
last
(
sort-by
val
(
frequencies
(
flatten
(
map
detect-sentence-dialect
(
dl/sentences
(
nlp
test-sentence-seven
))))))))
(
last
(
vals
rats
))
(
.getTarget
(
first
(
last
(
vals
rats
))))
(
bean
(
first
(
last
(
vals
rats
))))
(
bean
(
:relation
(
bean
(
first
(
last
(
vals
rats
))))))
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
)))
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
)))
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
0
)))
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
1
)))
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
2
)))
(
bean
(
:relation
(
bean
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
))))
;; This identifies "like" in this instance as being an interjection (filler word)
(
.tag
(
.getTarget
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
)))
(
.word
(
.getTarget
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
)))
(
.getLongName
(
.getRelation
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
)))
(
.getShortName
(
.getRelation
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
)))
;; REPL helper: for every sentence, build its dependency graph, datafy it, take
;; the `vals` of the result (presumably the graph's edges - confirm against
;; datalinguist's datafy output) and turn each one into a bean map for inspection.
;; Returns a lazy seq with one entry per sentence, each a lazy seq of bean maps.
;; NOTE: `word` is accepted but not used, so no filtering by word happens here.
(defn extract-relation
  [word sentences]
  (map (fn [sentence]
         (->> (dl/dependency-graph sentence)
              datafy
              vals
              (map bean)))
       sentences))
(
extract-relation
"like"
sentences-six
)
(
for
[
sentence
sentences-six
]
(
let
[
data-edges
(
vals
(
datafy
(
dl/dependency-graph
sentence
)))]
(
for
[
edge
data-edges
]
(
let
[
edge-bean
(
bean
edge
)]
edge-bean
))))
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
2
)))
(
datafy
(
first
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
2
))))))
(
nav
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
2
))))
(
datafy
(
:governor
(
first
(
map
datafy
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-six
0
))))
0
)))))
(
.word
(
.getSource
(
nth
(
nth
(
vals
(
datafy
(
dl/dependency-graph
(
nth
sentences-five
1
))))
3
)
2
)))
(
.word
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
9
))
(
.after
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
9
))
(
.word
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
10
))
(
.word
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
7
))
(
.after
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-one
1
))))
7
))
(
.word
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
9
))
(
.after
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
9
))
(
.word
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
10
))
(
.tag
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
10
))
(
.tag
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
9
))
(
.tag
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
10
))
(
.tag
(
nth
(
keys
(
datafy
(
dl/dependency-graph
(
nth
sentences-two
1
))))
9
))
;; So there are two main ways to detect a dialect: word use and grammar. Word use is simple enough,
;; just check if a word in the sentence matches the expected word.
;; For example, this returns true if the word "why" is in the given sentence.
(
some
#
(
=
"why"
%
)
(
dl/text
(
dl/tokens
sentence
)))
;; Then we have grammar, which is a lot harder to check. For instance, a sentence using "like"
;;as an interjection should return true when passed to the following code.
(
some
#
(
when
(
even?
%
)
%
)
'
(
1
2
3
4
))
(
def
something
[
5647
5858
76
938
62626
])
(
reduce
+
something
)
(
str
3837
345
8678
)
(
apply
str
something
)
(
map
str
something
)
(
def
testfreq
'
(
65
65
65
7
7
5
5
5
5
5
5
5
65
65
))
(
def
testfreq2
'
(
65
65
65
7
7
5
5
2
5
5
5
5
5
65
65
))
(
detect-sentence-dialect
sentences-one
)
(
detect-sentence-dialect
@
sentences-two
)
(
detect-sample-dialect
test-sentence-one
)
(
detect-sample-dialect
test-sentence-two
)
(
detect-sample-dialect
sentences-two
)
(
detect-sample-dialect
sentences-three
)
(
detect-sample-dialect
sentences-four
)
(
detect-sample-dialect
test-sentence-one
)
(
detect-sample-dialect
test-sentence-two
)
(
detect-sample-dialect
test-sentence-three
)
(
detect-sample-dialect
test-sentence-four
)
(
first
(
last
(
sort-by
val
(
frequencies
(
map
detect-sentence-dialect
sentences-one
)))))
(
first
(
last
(
sort-by
val
(
frequencies
(
map
detect-sentence-dialect
@
sentences-two
)))))
(
first
(
last
(
sort-by
val
(
frequencies
(
map
detect-sentence-dialect
sentences-three
)))))
(
detect-sentence-dialect
testfreq
)
(
detect-sentence-dialect
testfreq2
)
(
def
testfreq
'
(
"a"
"a"
"a"
"b"
"b"
"c"
"c"
"c"
"c"
"a"
"a"
"d"
"d"
))
(
frequencies
testfreq
)
(
first
(
last
(
sort-by
val
(
frequencies
testfreq
))))
(
detect-sentence-dialect
(
dl/text
(
dl/tokens
(
nth
sentences-one
1
))))
(
detect-sentence-dialect
(
dl/text
(
dl/tokens
(
nth
@
sentences-two
1
))))
(
distinct
testfreq
)
(
dedupe
testfreq
)
(
for
[
someval
something
]
(
str
someval
))
(
doseq
[
someval
something
]
(
println
someval
))
(
last
(
vals
rats
))
(
.getTarget
(
first
(
last
(
vals
rats
))))
(
bean
(
first
(
last
(
vals
rats
))))
(
bean
(
:relation
(
bean
(
first
(
last
(
vals
rats
))))))
(
->
(
vals
rats
)
(
last
)
(
first
)
(
bean
)
(
:relation
)
(
bean
))
(
->
(
vals
rats
)
(
last
)
(
first
)
(
.getTarget
))
*e
(
System/currentTimeMillis
)
(
spit
"test.txt"
"testtesttest"
)
(
spit
"test2.txt"
"{567 \"test\" 678 767}"
)
(
slurp
"test.txt"
)
(
read-string
(
slurp
"test2.txt"
))
(
clojure.edn/read-string
)
;; Don't use eval with arbitrary input
(
def
horror
(
eval
(
read-string
(
slurp
"test2.txt"
))))
horror
(
dl/text
sentences-one
)
(
dl/triples
sentences-one
)
(
dl/triples
(
dl/dependency-graph
(
nth
sentences-one
1
)))
;; Datafy the annotations. Retrieves direct annotations for every sentence.
;; Keep in mind that `dl/recur-datafy` currently doesn't work in this instance
;; and will possibly be removed in a future update:
;; https://github.com/simongray/datalinguist/issues/13
(
into
{}
(
map
datafy
@
sentences
))
(
keys
(
into
{}
(
map
datafy
@
sentences
)))
(
:natural-logic/relation-triples
(
into
{}
(
map
datafy
@
sentences
)))
(
:semantic-graph/enhanced-plus-plus-dependencies
(
into
{}
(
map
datafy
@
sentences
)))
(
:semantic-graph/collapsed-dependencies
(
into
{}
(
map
datafy
@
sentences
)))
(
:coref/mentions
(
into
{}
(
map
datafy
@
sentences
)))
(
:tokens
(
into
{}
(
map
datafy
@
sentences
)))
(
def
play-map
(
into
{}
(
map
datafy
@
sentences
)))
(
def
play-map
(
into
{}
(
datafy
@
annotated-example
)))
sentences
@
sentences
play-map
(
:text
play-map
)
#
_.
)
\ No newline at end of file
src/poalses/jonathan/dialect/main.clj
View file @
0dcbce8b
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
(
def
command-line-options
(
def
command-line-options
"Command line options parsing rules."
"Command line options parsing rules."
[[
"-l"
"--log-level LEVEL"
(
str
"Logging level "
(
seq
log-levels
))
[[
"-l"
"--log-level LEVEL"
(
str
"Logging level "
(
seq
log-levels
))
:default
:
error
:default
:
info
:parse-fn
#
(
keyword
(
string/join
(
rest
%
)))
:parse-fn
#
(
keyword
(
string/join
(
rest
%
)))
:validate
[
#
(
contains?
log-levels
%
)
:validate
[
#
(
contains?
log-levels
%
)
(
str
"Must be one of: "
(
seq
log-levels
))]]
(
str
"Must be one of: "
(
seq
log-levels
))]]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment