Commit 0dcbce8b authored by Jonathan Poalses

Added sample data and sample-expected files for the ML work

parent 70a83fdb
(ns poalses.jonathan.dialect.dialect-nlp
-  (:require [clojure.datafy :refer [datafy]]
-            [dk.simongray.datalinguist :as dl]
-            [dk.simongray.datalinguist.triple :refer [triple->datalog]])
-  (:import [edu.stanford.nlp.coref CorefCoreAnnotations$CorefChainAnnotation]))
+  (:require [dk.simongray.datalinguist :as dl]))

(def nlp
-  (dl/->pipeline {:annotators ["truecase"
-                               "quote"
-                               "entitymentions"
-                               "parse"
-                               "depparse"
-                               "lemma"
-                               "relation"
-                               "tokenize"
-                               "coref"
-                               "openie"
-                               "ner"]
-                  :quote {:extractUnclosedQuotes "true"}}))
+  (dl/->pipeline {:annotators ["tokenize"
+                               "ner"]}))

;; Word sets that will show a sentence as being of that dialect
@@ -64,20 +52,9 @@
(cond)
;; Take a text sample, split it into sentences, find each sentence's dialects, and return the most common dialect.
;; A sentence can have any number of dialects associated with it, since detect-sentence-dialect can return a collection;
;; when no dialect can be detected it defaults to standard. E.g. given a sample with three sentences, where one reads as
;; Scottish, one reads as both Scottish and Australian, and the last reads as nothing, the intermediate collection holds
;; two Scottish keys, one Australian key, and one standard key, so the sample is classified as Scottish.
;; (An illustrative sketch of the frequency vote follows the defn below.)
(defn detect-sample-dialect [sample]
  (first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp sample)))))))))
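;; Illustrative sketch only (literal keywords standing in for detect-sentence-dialect results,
;; not real pipeline output): the frequency vote above picks the most common dialect.
(comment
  (first (last (sort-by val (frequencies [:scottish :scottish :australian :standard]))))
  ;; => :scottish
  )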
;; Slurp a file containing a map of all the samples and their expected dialects,
;; detect the dialect of each sample, and compare the results against the expected values.
;; Output the results both to a file, as a map from each sample to whether it was detected correctly,
;; and to the command line as a fraction and percentage, with a breakdown per dialect.
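;; Hypothetical sketch of the samples file shape, inferred from how the keys and vals are used
;; below (sample text -> expected dialect name); the real EDN file may look different:
;; {"Och aye, whit a braw morning." "scottish"
;;  "G'day mate, chuck another snag on the barbie." "australian"}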
(defn detect-dialects
  [filename]
  (let [filedata (clojure.edn/read-string (slurp filename))
@@ -86,278 +63,3 @@
    (let [result-comparison (map zero? (map compare key-results expected-results))]
      (println (str (count (filter true? result-comparison)) "/" (count result-comparison) " correct.")))))
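;; Dumps the sample texts and their expected dialects (as printed strings) into two plain-text
;; files, presumably the sample data / sample expected files mentioned in the commit message.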
(defn generate-machine-data []
  (let [filedata (clojure.edn/read-string (slurp "all_samples2.edn"))
        data (str (keys filedata))
        value (str (vals filedata))]
    (spit "sample_data.txt" data)
    (spit "sample_expected.txt" value)))
(comment
(def filedata (clojure.edn/read-string (slurp "all_samples2.edn")))
(def key-results (map detect-sample-dialect (keys filedata)))
(def expected-results (map keyword (vals filedata)))
(def datayaya (str (keys filedata)))
(def whyyaya (str (vals filedata)))
(println datayaya)
(println whyyaya)
(spit "sample_data.txt" datayaya)
(spit "sample_expected.txt" whyyaya)
(interleave key-results (map zero? (map compare key-results expected-results)))
(zipmap (keys filedata) (map zero? (map compare key-results expected-results)))
;; Test every annotator in the pipeline
(map dl/true-case @sentences)
(map dl/quotations @sentences)
(map dl/mentions @sentences)
(map dl/annotation "relation" @sentences)
(map dl/constituency-tree @sentences)
(map dl/constituency-tree sentences-one)
(map dl/constituency-tree @sentences-two)
(map dl/dependency-graph @sentences)
(datafy (dl/dependency-graph (nth sentences-one 1)))
(datafy (dl/dependency-graph (nth @sentences-two 1)))
(:leafVertices (bean (dl/dependency-graph (nth @sentences-two 1))))
(map dl/dependency-graph @sentences-two)
(bean (dl/dependency-graph @sentences-two))
(map dl/lemma @sentences)
(map dl/lemma sentences-one)
(map dl/lemma @sentences-two)
(dl/text (dl/tokens (nth sentences-one 1)))
(map dl/tokens @sentences-two)
(->> (mapcat dl/triples @sentences) (map triple->datalog))
(dl/annotation CorefCoreAnnotations$CorefChainAnnotation @annotated-example)
(show-dependencies)
(def rats (datafy (dl/dependency-graph (nth sentences-one 1))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-one))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-two))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-three))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-four))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-five))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-six))))))))
(first (last (sort-by val (frequencies (flatten (map detect-sentence-dialect (dl/sentences (nlp test-sentence-seven))))))))
(last (vals rats))
(.getTarget (first (last (vals rats))))
(bean (first (last (vals rats))))
(bean (:relation (bean (first (last (vals rats))))))
(datafy (dl/dependency-graph (nth sentences-one 1)))
(datafy (dl/dependency-graph (nth sentences-five 1)))
(datafy (dl/dependency-graph (nth sentences-six 0)))
(datafy (dl/dependency-graph (nth sentences-six 1)))
(datafy (dl/dependency-graph (nth sentences-six 2)))
(bean (:relation (bean (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2))))
;; This identifies "like" in this instance as being an interjection (filler word)
(.tag (.getTarget (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.word (.getTarget (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.getLongName (.getRelation (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.getShortName (.getRelation (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
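;; Collects the bean maps of every dependency edge in each sentence; note that the word
;; parameter is currently unused, so nothing is filtered by word yet.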
(defn extract-relation [word sentences]
  (for [sentence sentences]
    (let [data-edges (vals (datafy (dl/dependency-graph sentence)))]
      (for [edge data-edges]
        (let [edge-bean (bean edge)]
          edge-bean)))))
(extract-relation "like" sentences-six)
(for [sentence sentences-six]
  (let [data-edges (vals (datafy (dl/dependency-graph sentence)))]
    (for [edge data-edges]
      (let [edge-bean (bean edge)]
        edge-bean))))
(datafy (dl/dependency-graph (nth sentences-six 2)))
(datafy (first (keys (datafy (dl/dependency-graph (nth sentences-six 2))))))
(nav (datafy (dl/dependency-graph (nth sentences-six 2))))
(datafy (:governor (first (map datafy (nth (vals (datafy (dl/dependency-graph (nth sentences-six 0)))) 0)))))
(.word (.getSource (nth (nth (vals (datafy (dl/dependency-graph (nth sentences-five 1)))) 3) 2)))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 9))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 9))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 10))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 7))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-one 1)))) 7))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.after (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.word (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 10))
(.tag (nth (keys (datafy (dl/dependency-graph (nth sentences-two 1)))) 9))
;; There are two main ways to detect a dialect: word use and grammar. Word use is simple enough;
;; just check whether a word in the sentence matches the expected word.
;; For example, this returns true if the word "why" is in the given sentence:
(some #(= "why" %) (dl/text (dl/tokens sentence)))
;; Then we have grammar, which is a lot harder to check. For instance, a sentence using "like"
;; as an interjection should come back truthy from a real grammar check; the form below is only a
;; placeholder for the shape of such a check (a hedged sketch follows it).
(some #(when (even? %) %) '(1 2 3 4))
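;; Hedged sketch, not the project's actual grammar check: one way to approximate the
;; "like as a filler interjection" test is to look for a dependency-edge target whose
;; word is "like" and whose part-of-speech tag is "UH" (the Penn Treebank interjection
;; tag), reusing the edge accessors explored above. Assumes the sentence was annotated
;; with the parsing annotators used in this scratch session.
(defn filler-like? [sentence]
  (some (fn [edge]
          (let [target (.getTarget edge)]
            (and (= "like" (.word target))
                 (= "UH" (.tag target)))))
        (apply concat (vals (datafy (dl/dependency-graph sentence))))))
;; e.g. (filler-like? (nth sentences-five 1)) would be expected to come back truthy for the
;; sentence explored above, assuming the same parse.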
(def something [5647 5858 76 938 62626])
(reduce + something)
(str 3837 345 8678)
(apply str something)
(map str something)
(def testfreq '(65 65 65 7 7 5 5 5 5 5 5 5 65 65))
(def testfreq2 '(65 65 65 7 7 5 5 2 5 5 5 5 5 65 65))
(detect-sentence-dialect sentences-one)
(detect-sentence-dialect @sentences-two)
(detect-sample-dialect test-sentence-one)
(detect-sample-dialect test-sentence-two)
(detect-sample-dialect sentences-two)
(detect-sample-dialect sentences-three)
(detect-sample-dialect sentences-four)
(detect-sample-dialect test-sentence-one)
(detect-sample-dialect test-sentence-two)
(detect-sample-dialect test-sentence-three)
(detect-sample-dialect test-sentence-four)
(first (last (sort-by val (frequencies (map detect-sentence-dialect sentences-one)))))
(first (last (sort-by val (frequencies (map detect-sentence-dialect @sentences-two)))))
(first (last (sort-by val (frequencies (map detect-sentence-dialect sentences-three)))))
(detect-sentence-dialect testfreq)
(detect-sentence-dialect testfreq2)
(def testfreq '("a" "a" "a" "b" "b" "c" "c" "c" "c" "a" "a" "d" "d"))
(frequencies testfreq)
(first (last (sort-by val (frequencies testfreq))))
(detect-sentence-dialect (dl/text (dl/tokens (nth sentences-one 1))))
(detect-sentence-dialect (dl/text (dl/tokens (nth @sentences-two 1))))
(distinct testfreq)
(dedupe testfreq)
(for [someval something]
  (str someval))
(doseq [someval something]
  (println someval))
(last (vals rats))
(.getTarget (first (last (vals rats))))
(bean (first (last (vals rats))))
(bean (:relation (bean (first (last (vals rats))))))
(-> (vals rats)
    (last)
    (first)
    (bean)
    (:relation)
    (bean))
(-> (vals rats)
    (last)
    (first)
    (.getTarget))
*e
(System/currentTimeMillis)
(spit "test.txt" "testtesttest")
(spit "test2.txt" "{567 \"test\" 678 767}")
(slurp "test.txt")
(read-string (slurp "test2.txt"))
(clojure.edn/read-string (slurp "test2.txt"))
;; Don't use eval with arbitrary input
(def horror (eval (read-string (slurp "test2.txt"))))
horror
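;; Note: the eval above executes whatever is in the file; clojure.edn/read-string (as used a
;; few lines up) reads the same map as plain data without evaluating anything.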
(dl/text sentences-one)
(dl/triples sentences-one)
(dl/triples (dl/dependency-graph (nth sentences-one 1)))
;; Datafy the annotations. Retrieves direct annotations for every sentence.
;; Keep in mind that `dl/recur-datafy` currently doesn't work in this instance
;; and will possibly be removed in a future update:
;; https://github.com/simongray/datalinguist/issues/13
(into {} (map datafy @sentences))
(keys (into {} (map datafy @sentences)))
(:natural-logic/relation-triples (into {} (map datafy @sentences)))
(:semantic-graph/enhanced-plus-plus-dependencies (into {} (map datafy @sentences)))
(:semantic-graph/collapsed-dependencies (into {} (map datafy @sentences)))
(:coref/mentions (into {} (map datafy @sentences)))
(:tokens (into {} (map datafy @sentences)))
(def play-map (into {} (map datafy @sentences)))
(def play-map (into {} (datafy @annotated-example)))
sentences
@sentences
play-map
(:text play-map)
#_.)
\ No newline at end of file
@@ -14,7 +14,7 @@
(def command-line-options
  "Command line options parsing rules."
  [["-l" "--log-level LEVEL" (str "Logging level " (seq log-levels))
-  :default :error
+  :default :info
   :parse-fn #(keyword (string/join (rest %)))
   :validate [#(contains? log-levels %)
              (str "Must be one of: " (seq log-levels))]]