Commit 0dcbce8b authored by Jonathan Poalses's avatar Jonathan Poalses

added sample data and sample expected for ml stuff

parent 70a83fdb
(ns poalses.jonathan.dialect.dialect-nlp
(:require [clojure.datafy :refer [datafy]]
[dk.simongray.datalinguist :as dl]
[dk.simongray.datalinguist.triple :refer [triple->datalog]])
(:import [edu.stanford.nlp.coref CorefCoreAnnotations$CorefChainAnnotation]))
(:require [dk.simongray.datalinguist :as dl]))
(def nlp
(dl/->pipeline {:annotators ["truecase"
"quote"
"entitymentions"
"parse"
"depparse"
"lemma"
"relation"
(dl/->pipeline {:annotators [
"tokenize"
"coref"
"openie"
"ner"]
:quote {:extractUnclosedQuotes "true"}}))
"ner"]}))
;; Word sets that will show a sentence as being of that dialect
......@@ -64,20 +52,9 @@
(cond)
;; Take a text sample, split it into sentences, find the dialects of each sentence, and return the most common dialect.
;; A sentence can have any number of dialects associated with it, because detect-sentence-dialect can return a collection;
;; when no dialect can be detected it defaults to standard. For example, given a sample with 3 sentences where one reads as scottish,
;; one reads as scottish and australian, and the last reads as nothing, the flattened per-sentence results contain 2 scottish keys,
;; one australian key, and one standard key, so the sample is classified as scottish.
(defn detect-sample-dialect
  "Returns the dialect that occurs most often across the sentences of `sample`.

  `sample` is run through the `nlp` pipeline and split with `dl/sentences`;
  `detect-sentence-dialect` is applied to every sentence and the results are
  flattened into one sequence, so a sentence may contribute several dialects.
  The dialects are counted and the one with the highest count is returned.
  Ties fall to whichever entry ends up last after the sort by count.
  Returns nil when there are no results to count."
  [sample]
  (->> (nlp sample)
       (dl/sentences)
       (map detect-sentence-dialect)
       ;; a sentence may yield a collection of dialects; merge them all
       (flatten)
       (frequencies)
       ;; ascending by count, so the most frequent entry is last
       (sort-by val)
       (last)
       ;; the entry is a [dialect count] pair; keep only the dialect
       (first)))
;; Slurp a file containing a map of all the samples and their expected dialects,
;; detect the dialect of each sample, collect the results, and compare them against the expected values.
;; Output the results both to a file, as a map from each sample to whether it was detected correctly,
;; and to the command line as a fraction and percentage, with a breakdown per dialect.
(defn detect-dialects
[filename]
(let [filedata (clojure.edn/read-string (slurp filename))
......@@ -86,278 +63,3 @@
(let [result-comparison (map zero? (map compare key-results expected-results))]
(println (str (count (filter true? result-comparison)) "/" (count result-comparison) " correct.")))))
(defn generate-machine-data
  "Splits the sample map stored in \"all_samples2.edn\" into two text files.

  The file is read as EDN. The printed sequence of its keys is written to
  \"sample_data.txt\" and the printed sequence of its values is written to
  \"sample_expected.txt\". Takes no arguments; returns the result of the
  final `spit`."
  []
  ;; NOTE(review): clojure.edn is referenced fully-qualified here; the visible
  ;; ns form does not require it -- confirm it is loaded before this runs.
  (let [samples  (-> "all_samples2.edn" slurp clojure.edn/read-string)
        inputs   (str (keys samples))
        expected (str (vals samples))]
    ;; both strings are built before anything is written to disk
    (spit "sample_data.txt" inputs)
    (spit "sample_expected.txt" expected)))
(comment
(def filedata (clojure.edn/read-string (slurp "all_samples2.edn")))
(def key-results (map detect-sample-dialect (keys filedata)))
(def expected-results (map keyword (vals filedata)))
(def datayaya (str (keys filedata)))
(def whyyaya (str (vals filedata)))
(def)
(println datayaya)
(println whyyaya)
(spit "sample_data.txt" datayaya)
(spit "sample_expected.txt" whyyaya)
(interleave key-results (map zero? (map compare key-results expected-results)))
(zipmap (keys filedata) (map zero? (map compare key-results expected-results)))
;; Test every annotator in the pipeline
(map dl/true-case @sentences)
(map dl/quotations @sentences)
(map dl/mentions @sentences)
(map dl/annotation "relation" @sentences)
(map dl/constituency-tree @sentences)
(map dl/constituency-tree sentences-one)
(map dl/constituency-tree @sentences-two)
(map dl/dependency-graph @sentences)
(datafy (dl/dependency-graph (nth sentences-one 1)))
(datafy (dl/dependency-graph (nth @sentences-two 1)))
(:leafVertices (bean (dl/dependency-graph (nth @sentences-two 1))))
(map dl/dependency-graph @sentences-two)
(bean (dl/dependency-graph @sentences-two))
(map dl/lemma @sentences)
(map dl/lemma sentences-one)
(map dl/lemma @sentences-two)
(dl/text (dl/tokens (nth sentences-one 1)))
(map dl/tokens @sentences-two)
(->> (mapcat dl/triples @sentences) (map triple->datalog))
(dl/annotation CorefCoreAnnotations$CorefChainAnnotation @annotated-example)
(show-dependencies)
(def rats (datafy (dl/dependency-graph (nth sentences-one 1))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-one))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-two))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-three))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-four))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-five))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-six))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-seven))))))))
(last (vals rats))
(.getTarget (first (last (vals rats))))
(bean (first (last (vals rats))))
(bean (:relation (bean (first (last (vals rats))))))
(datafy (dl/dependency-graph (nth sentences-one 1)))
(datafy (dl/dependency-graph (nth sentences-five 1)))
(datafy (dl/dependency-graph (nth sentences-six 0)))
(datafy (dl/dependency-graph (nth sentences-six 1)))
(datafy (dl/dependency-graph (nth sentences-six 2)))
(bean (:relation (bean (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2))))
;; This identifies "like" in this instance as being an interjection (filler word)
(.tag (.getTarget (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.word (.getTarget (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.getLongName (.getRelation (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.getShortName (.getRelation (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(defn extract-relation [word sentences]
(for [sentence sentences]
(let [data-edges (vals (datafy (dl/dependency-graph sentence)))]
(for [edge data-edges]
(let [edge-bean (bean edge)]
edge-bean)))))
(extract-relation "like" sentences-six)
(for [sentence sentences-six]
(let [data-edges (vals (datafy (dl/dependency-graph sentence)))]
(for [edge data-edges]
(let [edge-bean (bean edge)]
edge-bean))))
(datafy (dl/dependency-graph (nth sentences-six 2)))
(datafy (first (keys (datafy (dl/dependency-graph (nth sentences-six 2))))))
(nav (datafy (dl/dependency-graph (nth sentences-six 2))))
(datafy (:governor (first (map datafy (nth (vals (datafy (dl/dependency-graph (nth sentences-six 0)))) 0)))))
(.word (.getSource (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 9))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 9))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 10))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 7))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 7))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
;; So there are two main ways to detect a dialect: word use and grammar. Word use is simple enough:
;; just check whether a word in the sentence matches the expected word.
;; For example, this returns true if the word "why" is in the given sentence
(some #(= "why" %) (dl/text (dl/tokens sentence)))
;; Then we have grammar, which is a lot harder to check. For instance, a sentence using "like"
;;as an interjection should return true when passed to the following code.
(some #(when (even? %) %) '(1 2 3 4))
(def something [5647 5858 76 938 62626])
(reduce + something)
(str 3837 345 8678)
(apply str something)
(map str something)
(def testfreq '(65 65 65 7 7 5 5 5 5 5 5 5 65 65))
(def testfreq2 '(65 65 65 7 7 5 5 2 5 5 5 5 5 65 65))
(detect-sentence-dialect sentences-one)
(detect-sentence-dialect @sentences-two)
(detect-sample-dialect test-sentence-one)
(detect-sample-dialect test-sentence-two)
(detect-sample-dialect sentences-two)
(detect-sample-dialect sentences-three)
(detect-sample-dialect sentences-four)
(detect-sample-dialect test-sentence-one)
(detect-sample-dialect test-sentence-two)
(detect-sample-dialect test-sentence-three)
(detect-sample-dialect test-sentence-four)
(first (last (sort-by val (frequencies (map detect-sentence-dialect sentences-one)))))
(first (last (sort-by val (frequencies (map detect-sentence-dialect @sentences-two)))))
(first (last (sort-by val (frequencies (map detect-sentence-dialect sentences-three)))))
(detect-sentence-dialect testfreq)
(detect-sentence-dialect testfreq2)
(def testfreq '("a" "a" "a" "b" "b" "c" "c" "c" "c" "a" "a" "d" "d"))
(frequencies testfreq)
(first (last (sort-by val (frequencies testfreq))))
(detect-sentence-dialect (dl/text (dl/tokens (nth sentences-one 1))))
(detect-sentence-dialect (dl/text (dl/tokens (nth @sentences-two 1))))
(distinct testfreq)
(dedupe testfreq)
(for [someval something]
(str someval))
(doseq [someval something]
(println someval))
(last (vals rats))
(.getTarget (first (last (vals rats))))
(bean (first (last (vals rats))))
(bean (:relation (bean (first (last (vals rats))))))
(-> (vals rats)
(last)
(first)
(bean)
(:relation)
(bean))
(-> (vals rats)
(last)
(first)
(.getTarget))
*e
(System/currentTimeMillis)
(spit "test.txt" "testtesttest")
(spit "test2.txt" "{567 \"test\" 678 767}")
(slurp "test.txt")
(read-string (slurp "test2.txt"))
(clojure.edn/read-string)
;; Don't use eval with arbitrary input
(def horror (eval (read-string (slurp "test2.txt"))))
horror
(dl/text sentences-one)
(dl/triples sentences-one)
(dl/triples (dl/dependency-graph (nth sentences-one 1)))
;; Datafy the annotations. Retrieves direct annotations for every sentence.
;; Keep in mind that `dl/recur-datafy` currently doesn't work in this instance
;; and will possibly be removed in a future update:
;; https://github.com/simongray/datalinguist/issues/13
(into {} (map datafy @sentences))
(keys (into {} (map datafy @sentences)))
(:natural-logic/relation-triples (into {} (map datafy @sentences)))
(:semantic-graph/enhanced-plus-plus-dependencies (into {} (map datafy @sentences)))
(:semantic-graph/collapsed-dependencies (into {} (map datafy @sentences)))
(:coref/mentions (into {} (map datafy @sentences)))
(:tokens (into {} (map datafy @sentences)))
(def play-map (into {} (map datafy @sentences)))
(def play-map (into {} (datafy @annotated-example)))
sentences
@sentences
play-map
(:text play-map)
#_.)
\ No newline at end of file
......@@ -14,7 +14,7 @@
(def command-line-options
"Command line options parsing rules."
[["-l" "--log-level LEVEL" (str "Logging level " (seq log-levels))
:default :error
:default :info
:parse-fn #(keyword (string/join (rest %)))
:validate [#(contains? log-levels %)
(str "Must be one of: " (seq log-levels))]]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment