Pages

Sunday, November 11, 2018

Move NLP IKVM to F# .NET-Friendly

Since the last post, I have read Sergey's code and decided to refactor it so that the data is stored in .NET- and F#-friendly structures. Stanford NLP does provide a server, but I still want to make it .NET friendly and also get familiar with the NLP core myself.

I prefer a project-based solution to an interactive one because it gives me Visual Studio's Watch and Immediate windows and the debugger visualizers. Once I get the information into a comfortable environment, it will be easy to move forward.

The roughly 200 lines of code build up a structure like the following:

I call multiple sentences a story. A story contains (1) sentences and (2) cross-references. The sentence structure shows NLP info about a sentence. The sentence includes a token list, tree, and dependency graph. The cross-references maintain the relationship among elements from different sentences. The first file is the main file. It shows how to invoke the underlying functions and show the structures. 


1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
// Learn more about F# at http://fsharp.org
// See the 'F# Tutorial' project for more help.

open System
open System.IO
open java.util
open java.io
open edu.stanford.nlp.pipeline
open edu.stanford.nlp.ling
open Utils.NLPUtils
open Utils.NLPExtensions
open edu.stanford.nlp.util
open edu.stanford.nlp.trees
open edu.stanford.nlp.semgraph
open Utils.NLPStructures
open edu.stanford.nlp.coref

/// Entry point. Runs a Stanford CoreNLP pipeline over a hard-coded two-sentence
/// sample, converts the resulting annotation into the F# `StoryType` record
/// (sentences plus cross-sentence coreference links) and prints it.
/// `argv` is not used. Always returns 0 as the process exit code.
[<EntryPoint>]
let main argv = 
    let text = "Kosgi Santosh sent an email to Stanford University. He didn't get a reply email."

    // Annotation pipeline configuration
    let props = Properties()
    props.setProperty("annotators","tokenize, ssplit, pos, lemma, ner, parse, dcoref") |> ignore
    // ner.useSUTime is a boolean option, so spell the value out. The former "0"
    // only disabled SUTime because any text other than "true" is read as false;
    // by the same rule "1" would not have enabled it, which makes "0" misleading.
    props.setProperty("ner.useSUTime","false") |> ignore

    let pipeline = StanfordCoreNLP(props)

    // Annotation
    let annotation = Annotation(text)
    pipeline.annotate(annotation)

    // Coreference chains: exactly one map is expected under this key
    // (Seq.exactlyOne throws otherwise).
    let corefChains = annotation.GetToken<HashMap>(typeof<CorefCoreAnnotations.CorefChainAnnotation>)
    let mentions = corefChains |> Seq.exactlyOne |> getMentions

    // One SentenceType per sentence found by the pipeline.
    let sentences = 
        [
            let coreMaps = annotation.GetToken<CoreMap>(typeof<CoreAnnotations.SentencesAnnotation>)

            for s in coreMaps do
                // Tokens first: the tree and the dependency graph both refer
                // back to these words by token index.
                let tokens = s.GetToken<CoreLabel>(typeof<CoreAnnotations.TokensAnnotation>)
                let words = getWords tokens

                let parse = s.GetToken<Tree>(typeof<TreeCoreAnnotations.TreeAnnotation>)
                let tree = parse |> Seq.exactlyOne |> buildTree words

                let deps = s.GetToken<SemanticGraph>(typeof<SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation>)
                let relationships = deps |> Seq.exactlyOne |> getDependencyGraph words

                yield { Words = words; Dependency = relationships; Tree = tree }
        ]

    let story = 
        {
            CrossLinks = mentions
            Sentences = sentences
        }

    printfn "%O" story

    0 // return an integer exit code


The second file is the library file. I do not think the structure will stay the same after two weeks; I might decide to add more fields. But for now the foundation is there.



1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
namespace Utils

module NLPStructures = 

    /// Text of a token.
    type Word = string
    /// Named-entity tag of a token.
    type Ner = string
    /// Part-of-speech tag; also used as the label of an inner parse-tree node.
    type POS = string
    /// Integer identifier (token index, mention id or chain id depending on the record).
    type Index = int
    /// Name of a relationship between two elements.
    type Relationship = string
    /// Pair of (start index, end index).
    type Span = int * int
    /// Pair of (head index, mention text).
    type Head = int * string
    /// Number of the sentence an element belongs to.
    type SentenceIndex = int

    /// A single token together with its annotations.
    type WordType = 
        {
            Word : Word
            Ner : Ner
            POS : POS
            Index : Index
        }

        /// True when this token's text equals `word`.
        member this.IsSame (word : Word) = this.Word = word

        /// True when this token's index equals `index`.
        member this.IsSame (index : Index) = this.Index = index

    /// One mention inside a coreference chain.
    type MentionEntity =
        {
            Index : Index
            Relationship : Relationship
            Span : Span
            Head : Head
            SentenceIndex : SentenceIndex
        }

    /// The mention chosen to represent a whole chain.
    type RepresentiveMention = MentionEntity

    /// A coreference chain: its id, its representative mention and all of its mentions.
    type CrossLinkType =
        {
            Index : Index
            RepresentiveMention : RepresentiveMention
            Mentions : MentionEntity list
        }

    /// An entry of a sentence's dependency list.
    type DependencyGraph = 
        /// Named relation between two words of the sentence.
        | Link of Relationship * WordType * WordType
        /// A coreference chain.
        | CrossLink of CrossLinkType

    /// Parse tree of a sentence.
    type TreeNode = 
        /// Leaf: a label and the word it stands for.
        | Node of string * WordType
        /// Inner node: a label and its children.
        | SubNodes of POS * TreeNode list

    /// Everything extracted for one sentence.
    type SentenceType =
        {
            Words : WordType list
            Dependency : DependencyGraph list
            Tree : TreeNode
        }

    /// A sequence of sentences plus the coreference chains linking them.
    type StoryType = 
        {
            CrossLinks : CrossLinkType list
            Sentences : SentenceType list
        }

module NLPUtils =
    open edu.stanford.nlp.trees
    open edu.stanford.nlp.ling
    open edu.stanford.nlp.semgraph

    open NLPStructures

    /// Views a boxed `java.util.ArrayList` as a `seq<'T>`.
    /// The downcast to `ArrayList` happens immediately and throws if `obj` is
    /// some other type; the per-element cast to `'T` happens during enumeration.
    let toEnumerable<'T> (obj:obj) =
        Seq.cast<'T> (obj :?> java.util.ArrayList)
            
    /// Converts a .NET `System.Type` into the matching `java.lang.Class`
    /// through the implicit conversion operator defined on `java.lang.Class`.
    let toJavaClass (t:System.Type) : java.lang.Class =
        java.lang.Class.op_Implicit t

    /// Returns the first entry of `words` whose text equals `word`.
    /// `Seq.find` throws when no entry matches.
    let findWord (words:WordType list) (word:Word) =
        let hasText (candidate:WordType) = candidate.IsSame(word)
        Seq.find hasText words
    /// Returns the first entry of `words` whose index equals `i`.
    /// `Seq.find` throws when no entry matches.
    let findIndex (words:WordType list) (i:Index) =
        let hasIndex (candidate:WordType) = candidate.IsSame(i)
        Seq.find hasIndex words

    /// Looks up the value stored on `x` under the annotation key type `t`.
    /// `x` may be any type that exposes a `get : java.lang.Class -> obj` member;
    /// the member is resolved statically, which is why the function is `inline`.
    /// The result is returned untyped (`obj`); callers downcast it themselves.
    let inline getObjFromMap (x:^T) t = 
        // The key is passed as a .NET type and converted to its Java class.
        let key = t |> toJavaClass
        // Statically-resolved call of x.get(key).
        (^T : (member get : java.lang.Class -> obj) (x, key) )

    /// Converts CoreNLP tokens into `WordType` records, preserving their order.
    /// For each token the text, part-of-speech tag and named-entity tag are read
    /// from the token's annotations, and the index comes from `token.index()`.
    let getWords (tokens:seq<CoreLabel>) =
        let toWord (token:CoreLabel) =
            // Reads one annotation of the current token as an untyped object.
            let annotationOf (key:System.Type) = getObjFromMap token key
            let text = (annotationOf typeof<CoreAnnotations.TextAnnotation>) :?> Word
            let posTag = (annotationOf typeof<CoreAnnotations.PartOfSpeechAnnotation>) :?> POS
            let nerTag = (annotationOf typeof<CoreAnnotations.NamedEntityTagAnnotation>) :?> Ner
            { Word = text; Ner = nerTag; POS = posTag; Index = token.index() }
        tokens |> Seq.map toWord |> Seq.toList

    /// Converts the edges of a CoreNLP semantic graph into `Link` entries, in the
    /// order given by `edgeListSorted`. Governor and dependent are resolved to
    /// entries of `words` by token index, so both must be present in `words`
    /// (the lookup throws otherwise). The relation is stored under its long name.
    let getDependencyGraph words (deps:SemanticGraph)  =
        let toLink (edge:SemanticGraphEdge) =
            let governor = findIndex words (edge.getGovernor().index())
            let dependent = findIndex words (edge.getDependent().index())
            Link(edge.getRelation().getLongName(), governor, dependent)
        deps.edgeListSorted().toArray()
        |> Seq.cast<SemanticGraphEdge>
        |> Seq.map toLink
        |> Seq.toList
    
    /// Recursively converts a CoreNLP parse tree into a `TreeNode`.
    /// A tree without children becomes a `Node` carrying its value and the word
    /// from `words` with the same token index; any other tree becomes a
    /// `SubNodes` carrying its value and its converted children, in order.
    let rec buildTree words (tree:Tree)  =
        let label = tree.value()
        let kids = tree.children()
        match kids.Length with
        | 0 ->
            // Leaf: its label carries the index of the token it stands for.
            let leafLabel = tree.label() :?> CoreLabel
            Node(label, findIndex words (leafLabel.index()))
        | _ ->
            let subtrees = kids |> Array.map (buildTree words) |> List.ofArray
            SubNodes(label, subtrees)

    /// Copies the fields of a CoreNLP coreference mention into a `MentionEntity`:
    /// mention id, animacy name (stored in `Relationship`), start/end index,
    /// head index with the mention text, and the sentence number.
    let getMention (mention:edu.stanford.nlp.coref.data.CorefChain.CorefMention) : MentionEntity =
        let bounds = (mention.startIndex, mention.endIndex)
        let animacy = mention.animacy.name()
        let headInfo = (mention.headIndex, mention.mentionSpan)
        {
            Index = mention.mentionID
            Relationship = animacy
            Span = bounds
            Head = headInfo
            SentenceIndex = mention.sentNum
        }

    /// Converts a map of coreference chains into `CrossLinkType` records.
    /// The values of `keys` are expected to be `CorefChain` instances (the cast
    /// throws otherwise); they are visited in the map's key-set order. Each
    /// result carries the chain id, the representative mention and all
    /// mentions in textual order.
    let getMentions (keys:java.util.HashMap) =
        let toCrossLink (chain:edu.stanford.nlp.coref.data.CorefChain) : CrossLinkType =
            let representative = getMention (chain.getRepresentativeMention())
            let chainId = chain.getChainID()
            let members =
                chain.getMentionsInTextualOrder().toArray()
                |> Seq.cast<edu.stanford.nlp.coref.data.CorefChain.CorefMention>
                |> Seq.map getMention
                |> Seq.toList
            { Index = chainId; RepresentiveMention = representative; Mentions = members }
        keys.keySet().toArray()
        |> Array.map (fun key -> toCrossLink ((keys.get(key)) :?> edu.stanford.nlp.coref.data.CorefChain))
        |> List.ofArray
    
    /// Normalises an annotation value to a `seq<'T>`: a `java.util.ArrayList`
    /// is exposed element by element, anything else is cast to `'T` and wrapped
    /// in a one-element sequence.
    let returnSeq<'T> (x:obj) =
        match x with
        | :? java.util.ArrayList as items -> Seq.cast<'T> items
        | _ -> Seq.singleton (x :?> 'T)
    
module NLPExtensions = 
    open NLPUtils
    open edu.stanford.nlp.util

    type CoreMap with
        /// Reads the annotation stored under key type `t` and returns it as a
        /// `seq<'T>`: the elements of the value when it is a
        /// `java.util.ArrayList`, otherwise the value itself as a single item.
        member this.GetToken<'T> (t:System.Type) =
            let raw = getObjFromMap this t
            returnSeq<'T> raw

The execution result is shown below:




No comments: