I prefer the project-based solution to the interactive one because it gives me Visual Studio's Watch and Immediate windows and the debug visualizers. Once I have the information in a comfortable environment, it is easy to move forward.
The 200 lines of code below build up a structure like the following:
I call a group of sentences a story. A story contains (1) sentences and (2) cross-references. Each sentence structure holds the NLP information about one sentence: a token list, a parse tree, and a dependency graph. The cross-references maintain the relationships among elements from different sentences. The first file is the main file. It shows how to invoke the underlying functions and display the resulting structures.
// Learn more about F# at http://fsharp.org
// See the 'F# Tutorial' project for more help.

open System
open System.IO
open java.util
open java.io
open edu.stanford.nlp.pipeline
open edu.stanford.nlp.ling
open Utils.NLPUtils
open Utils.NLPExtensions
open edu.stanford.nlp.util
open edu.stanford.nlp.trees
open edu.stanford.nlp.semgraph
open Utils.NLPStructures
open edu.stanford.nlp.coref

// Entry point: annotates a fixed two-sentence text with Stanford CoreNLP,
// converts the annotations into a StoryType value (cross-links + sentences)
// and prints it. Always returns exit code 0.
[<EntryPoint>]
let main argv =
    let input = "Kosgi Santosh sent an email to Stanford University. He didn't get a reply email."

    // Pipeline configuration: which annotators to run, SUTime switched off.
    let config = Properties()
    config.setProperty("annotators","tokenize, ssplit, pos, lemma, ner, parse, dcoref") |> ignore
    config.setProperty("ner.useSUTime","0") |> ignore
    let nlp = StanfordCoreNLP(config)

    // Run every configured annotator over the input text.
    let document = Annotation(input)
    nlp.annotate(document)

    // Coreference chains: the annotation is read as exactly one HashMap.
    let crossLinks =
        document.GetToken<HashMap>(typeof<CorefCoreAnnotations.CorefChainAnnotation>)
        |> Seq.exactlyOne
        |> getMentions

    // Converts one annotated sentence into a SentenceType record
    // (words first, because the tree and the dependency graph refer to them).
    let toSentence (s:CoreMap) =
        let words =
            s.GetToken<CoreLabel>(typeof<CoreAnnotations.TokensAnnotation>)
            |> getWords
        let tree =
            s.GetToken<Tree>(typeof<TreeCoreAnnotations.TreeAnnotation>)
            |> Seq.exactlyOne
            |> buildTree words
        let relationships =
            s.GetToken<SemanticGraph>(typeof<SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation>)
            |> Seq.exactlyOne
            |> getDependencyGraph words
        { Words = words; Dependency = relationships; Tree = tree }

    let sentences =
        document.GetToken<CoreMap>(typeof<CoreAnnotations.SentencesAnnotation>)
        |> Seq.map toSentence
        |> Seq.toList

    let story = { CrossLinks = crossLinks; Sentences = sentences }
    printfn "%O" story
    0 // return an integer exit code
The second file is the library file. I do not think the structure will stay the same two weeks from now; I might decide to add more fields. But for now the foundation is there.
namespace Utils

// Plain F# data structures that mirror the CoreNLP annotations of a text.
module NLPStructures =
    type Word = string
    type Ner = string
    type POS = string
    type Index = int
    type Relationship = string
    type Span = int * int
    type Head = int * string
    type SentenceIndex = int

    /// One token of a sentence: its text, NER tag, POS tag and token index.
    type WordType =
        { Word : Word
          Ner: Ner
          POS: POS
          Index: Index }
        /// True when this token's text equals the given word.
        member this.IsSame (word:Word) = (this.Word = word)
        /// True when this token's index equals the given index.
        member this.IsSame (index:Index) = (this.Index = index)

    /// One mention inside a coreference chain.
    type MentionEntity =
        { Index: Index
          Relationship: Relationship
          Span : Span
          Head : Head
          SentenceIndex : SentenceIndex }

    type RepresentiveMention = MentionEntity

    /// One coreference chain: its id, its representative mention and all mentions.
    type CrossLinkType =
        { Index: Index
          RepresentiveMention : RepresentiveMention
          Mentions : MentionEntity list }

    /// A dependency edge inside a sentence, or a cross-sentence link.
    type DependencyGraph =
        | Link of Relationship * WordType * WordType
        | CrossLink of CrossLinkType

    /// Parse tree: a leaf carrying a word, or a labelled list of sub-trees.
    type TreeNode =
        | Node of string * WordType
        | SubNodes of POS * TreeNode list

    /// Everything extracted for one sentence.
    type SentenceType =
        { Words: WordType list
          Dependency: DependencyGraph list
          Tree: TreeNode }

    /// A group of sentences plus the coreference links between them.
    type StoryType =
        { CrossLinks : CrossLinkType list
          Sentences : SentenceType list }

// Helpers that convert CoreNLP objects into the NLPStructures types.
module NLPUtils =
    open edu.stanford.nlp.trees
    open edu.stanford.nlp.ling
    open edu.stanford.nlp.semgraph
    open NLPStructures

    /// Casts obj to java.util.ArrayList and exposes its items as seq<'T>.
    let toEnumerable<'T> (obj:obj) =
        (obj :?> java.util.ArrayList) |> Seq.cast<'T>

    /// Converts a .NET type into the corresponding java.lang.Class.
    let toJavaClass (t:System.Type) =
        java.lang.Class.op_Implicit(t)

    /// First word whose text equals `word`; Seq.find throws when there is none.
    let findWord (words:WordType list) (word:Word) =
        Seq.find (fun (candidate:WordType) -> candidate.IsSame(word)) words

    /// First word whose index equals `i`; Seq.find throws when there is none.
    let findIndex (words:WordType list) (i:Index) =
        Seq.find (fun (candidate:WordType) -> candidate.IsSame(i)) words

    /// Calls `x.get(java.lang.Class)` on any type that has such a member,
    /// using the Java class that corresponds to the .NET type `t`.
    let inline getObjFromMap (x:^T) t =
        (^T : (member get : java.lang.Class -> obj) (x, toJavaClass t))

    /// Builds one WordType per token from its text, POS, NER and index.
    let getWords (tokens:seq<CoreLabel>) =
        tokens
        |> Seq.map (fun (token:CoreLabel) ->
            let text = (getObjFromMap token typeof<CoreAnnotations.TextAnnotation>) :?> Word
            let pos = (getObjFromMap token typeof<CoreAnnotations.PartOfSpeechAnnotation>) :?> POS
            let ner = (getObjFromMap token typeof<CoreAnnotations.NamedEntityTagAnnotation>) :?> Ner
            let index = token.index()
            { Word = text; Ner = ner; POS = pos; Index = index })
        |> Seq.toList

    /// Turns every edge of the semantic graph (in edgeListSorted order) into a
    /// Link between the governor word and the dependent word.
    let getDependencyGraph words (deps:SemanticGraph) =
        deps.edgeListSorted().toArray()
        |> Seq.cast<SemanticGraphEdge>
        |> Seq.map (fun edge ->
            let governor = findIndex words (edge.getGovernor().index())
            let dependent = findIndex words (edge.getDependent().index())
            Link(edge.getRelation().getLongName(), governor, dependent))
        |> Seq.toList

    /// Recursively converts a CoreNLP tree: a node without children becomes a
    /// Node bound to its word, any other node becomes SubNodes.
    let rec buildTree words (tree:Tree) =
        let label = tree.value()
        let children = tree.children()
        if children.Length > 0 then
            SubNodes(label, [ for child in children -> buildTree words child ])
        else
            // The leaf's label is cast to CoreLabel to read its token index.
            let coreLabel = tree.label() :?> CoreLabel
            Node(label, findIndex words (coreLabel.index()))

    /// Copies the fields of a coreference mention into a MentionEntity.
    /// Relationship is filled with the name of the mention's animacy value.
    let getMention (mention:edu.stanford.nlp.coref.data.CorefChain.CorefMention) : MentionEntity =
        { Index = mention.mentionID
          Relationship = mention.animacy.name()
          Span = (mention.startIndex, mention.endIndex)
          Head = (mention.headIndex, mention.mentionSpan)
          SentenceIndex = mention.sentNum }

    /// Converts every value of the map (cast to CorefChain) into a CrossLinkType.
    let getMentions (keys:java.util.HashMap) =
        let toCrossLink (key:obj) : CrossLinkType =
            let chain = keys.get(key) :?> edu.stanford.nlp.coref.data.CorefChain
            let representative = getMention (chain.getRepresentativeMention())
            let chainId = chain.getChainID()
            let members =
                chain.getMentionsInTextualOrder().toArray()
                |> Seq.cast<edu.stanford.nlp.coref.data.CorefChain.CorefMention>
                |> Seq.map getMention
                |> Seq.toList
            { Index = chainId; RepresentiveMention = representative; Mentions = members }
        keys.keySet().toArray()
        |> Seq.map toCrossLink
        |> Seq.toList

    /// An ArrayList is exposed as its items; any other object is cast to 'T
    /// and wrapped in a one-element sequence.
    let returnSeq<'T> (x:obj) =
        match x with
        | :? java.util.ArrayList -> toEnumerable<'T> x
        | _ -> Seq.singleton (x :?> 'T)

// Extension that reads a typed annotation from a CoreMap.
module NLPExtensions =
    open NLPUtils
    open edu.stanford.nlp.util

    type CoreMap with
        /// Looks up the annotation stored under type `t` and returns it as seq<'T>.
        member this.GetToken<'T> (t:System.Type) =
            returnSeq<'T> (getObjFromMap this t)
No comments:
Post a Comment