Move NLP IKVM to F# .NET-Friendly

Since the last post, I have read Sergey's code. I then decided to refactor the code so that the data is stored in .NET- and F#-friendly structures. Stanford NLP does provide a server, but I still want to make it .NET-friendly and also get myself familiar with the NLP core.

I prefer the project-based solution to the interactive one because it gives me Visual Studio's Watch and Immediate windows and the debug visualizer. Once I get the information into a comfortable environment, it will be easy to move forward.

The 200 lines of code build up a structure like the following:

I call multiple sentences a story. A story contains (1) sentences and (2) cross-references. The sentence structure holds the NLP info about a sentence: a token list, a tree, and a dependency graph. The cross-references maintain the relationships among elements from different sentences. The first file is the main file. It shows how to invoke the underlying functions and print the resulting structure.

// Learn more about F# at http://fsharp.org
// See the 'F# Tutorial' project for more help.

open System
open System.IO
open java.util
open java.io
open edu.stanford.nlp.pipeline
open edu.stanford.nlp.ling
open Utils.NLPUtils
open Utils.NLPExtensions
open edu.stanford.nlp.util
open edu.stanford.nlp.trees
open edu.stanford.nlp.semgraph
open Utils.NLPStructures
open edu.stanford.nlp.coref

[<EntryPoint>]
let main argv = 
    // Sample input: two sentences, so the coreference step has something to link.
    let text = "Kosgi Santosh sent an email to Stanford University. He didn't get a reply email."

    // Pipeline configuration: which annotators to run, and the SUTime switch for NER.
    let props = Properties()
    props.setProperty("annotators","tokenize, ssplit, pos, lemma, ner, parse, dcoref") |> ignore
    props.setProperty("ner.useSUTime","0") |> ignore

    // Build the pipeline and run it over the text; the results are stored on `annotation`.
    let pipeline = StanfordCoreNLP(props)
    let annotation = Annotation(text)
    pipeline.annotate(annotation)

    // Cross-sentence links: the coref-chain annotation is expected to be a single map.
    let crossLinks =
        annotation.GetToken<HashMap>(typeof<CorefCoreAnnotations.CorefChainAnnotation>)
        |> Seq.exactlyOne
        |> getMentions

    // Converts one annotated sentence into the F# record (words, parse tree, dependencies).
    let toSentence (s:CoreMap) =
        let words =
            s.GetToken<CoreLabel>(typeof<CoreAnnotations.TokensAnnotation>)
            |> getWords

        let tree =
            s.GetToken<Tree>(typeof<TreeCoreAnnotations.TreeAnnotation>)
            |> Seq.exactlyOne
            |> buildTree words

        let relationships =
            s.GetToken<SemanticGraph>(typeof<SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation>)
            |> Seq.exactlyOne
            |> getDependencyGraph words

        { Words = words; Dependency = relationships; Tree = tree; }

    let sentences =
        annotation.GetToken<CoreMap>(typeof<CoreAnnotations.SentencesAnnotation>)
        |> Seq.map toSentence
        |> Seq.toList

    let story = { CrossLinks = crossLinks; Sentences = sentences; }

    printfn "%O" story

    0 // return an integer exit code

The second file is the library file. I do not think the structure will stay the same after two weeks; I might decide to add more fields. But for now the foundation is there.

namespace Utils

module NLPStructures = 

    // Aliases that give readable names to the primitive values stored below.

    /// Text of a token.
    type Word = string
    /// Named-entity tag of a token.
    type Ner = string
    /// Part-of-speech tag; also used as the label of an inner tree node.
    type POS = string
    /// Integer identifier (token index, mention id or chain id depending on the record).
    type Index = int
    /// Free-form label attached to a link or a mention.
    type Relationship = string
    /// (start, end) pair of indices.
    type Span = int * int
    /// (head index, mention text) pair.
    type Head = int * string
    /// Number of the sentence a mention belongs to.
    type SentenceIndex = int

    /// One token of a sentence together with its tags and index.
    type WordType = 
        {
            Word : Word
            Ner : Ner
            POS : POS
            Index : Index
        }

        /// True when this token's text equals `word`.
        member this.IsSame (word:Word) = (this.Word = word)

        /// True when this token's index equals `index`.
        member this.IsSame (index:Index) = (this.Index = index)

    /// One mention inside a coreference chain.
    type MentionEntity =
        {
            Index : Index
            Relationship : Relationship
            Span : Span
            Head : Head
            SentenceIndex : SentenceIndex
        }

    /// The mention chosen to represent a whole chain.
    type RepresentiveMention = MentionEntity

    /// A coreference chain: its id, its representative mention and all of its mentions.
    type CrossLinkType =
        {
            Index : Index
            RepresentiveMention : RepresentiveMention
            Mentions : MentionEntity list
        }

    /// An entry of a sentence's dependency list.
    type DependencyGraph = 
        /// Relation name followed by the governor and dependent tokens.
        | Link of Relationship * WordType * WordType
        /// A coreference chain entry.
        | CrossLink of CrossLinkType

    /// Parse tree of a sentence.
    type TreeNode = 
        /// Leaf: node label plus the token it stands for.
        | Node of string * WordType
        /// Inner node: label plus its children.
        | SubNodes of POS * TreeNode list

    /// Everything extracted for a single sentence.
    type SentenceType =
        {
            Words : WordType list
            Dependency : DependencyGraph list
            Tree : TreeNode
        }

    /// A group of sentences plus the links that cross sentence boundaries.
    type StoryType = 
        {
            CrossLinks : CrossLinkType list
            Sentences : SentenceType list
        }

module NLPUtils =
    open edu.stanford.nlp.trees
    open edu.stanford.nlp.ling
    open edu.stanford.nlp.semgraph

    open NLPStructures

    /// Views a boxed `java.util.ArrayList` as a sequence of 'T.
    /// The downcast fails with InvalidCastException when `obj` is any other type.
    let toEnumerable<'T> (obj:obj) = 
        (obj :?> java.util.ArrayList) |> Seq.cast<'T>
            
    /// Converts a .NET `System.Type` into the corresponding `java.lang.Class`
    /// through the implicit conversion operator defined on `java.lang.Class`.
    let toJavaClass (t:System.Type) : java.lang.Class =
        java.lang.Class.op_Implicit(t)

    /// Returns the first entry of `words` whose text equals `word`.
    /// `Seq.find` raises KeyNotFoundException when nothing matches.
    let findWord (words:WordType list) (word:Word) = 
        let hasText (candidate:WordType) = candidate.IsSame(word)
        Seq.find hasText words
    /// Returns the first entry of `words` whose index equals `i`.
    /// `Seq.find` raises KeyNotFoundException when nothing matches.
    let findIndex (words:WordType list) (i:Index) = 
        let hasIndex (candidate:WordType) = candidate.IsSame(i)
        Seq.find hasIndex words

    /// Looks up the value stored under annotation type `t` in `x`.
    /// `x` may be any type exposing `get : java.lang.Class -> obj`; the member
    /// constraint is resolved statically, which is why the function is inline.
    let inline getObjFromMap (x:^T) t = 
        (^T : (member get : java.lang.Class -> obj) (x, toJavaClass t))

    /// Converts CoreNLP tokens into `WordType` records, preserving their order.
    /// Text, part-of-speech and named-entity tag are read from the token's
    /// annotations; the index comes from `token.index()`.
    let getWords (tokens:seq<CoreLabel>) = 
        let toWord (token:CoreLabel) : WordType =
            let text = getObjFromMap token typeof<CoreAnnotations.TextAnnotation> :?> Word
            let posTag = getObjFromMap token typeof<CoreAnnotations.PartOfSpeechAnnotation> :?> POS
            let nerTag = getObjFromMap token typeof<CoreAnnotations.NamedEntityTagAnnotation> :?> Ner
            { Word = text; Ner = nerTag; POS = posTag; Index = token.index() }

        tokens
        |> Seq.map toWord
        |> Seq.toList

    /// Turns every edge of `deps` (in `edgeListSorted` order) into a `Link`
    /// holding the relation's long name and the governor/dependent words.
    /// Both ends are looked up in `words` by token index, so an index that is
    /// missing from `words` raises KeyNotFoundException (via `findIndex`).
    let getDependencyGraph words (deps:SemanticGraph)  = 
        let toLink (edge:SemanticGraphEdge) =
            let governor = findIndex words (edge.getGovernor().index())
            let dependent = findIndex words (edge.getDependent().index())
            Link(edge.getRelation().getLongName(), governor, dependent)

        deps.edgeListSorted().toArray()
        |> Seq.cast<SemanticGraphEdge>
        |> Seq.map toLink
        |> Seq.toList
    
    /// Recursively converts a CoreNLP `Tree` into a `TreeNode`.
    /// A tree without children becomes a `Node` tied to the word that has the
    /// same index as the leaf's `CoreLabel`; any other tree becomes `SubNodes`
    /// with its children converted in order.
    let rec buildTree words (tree:Tree)  = 
        let label = tree.value()
        match tree.children() with
        | [||] ->
            // Leaf: its label carries the token index used to find the word.
            let leaf = tree.label() :?> CoreLabel
            Node(label, findIndex words (leaf.index()))
        | subtrees ->
            SubNodes(label, [ for subtree in subtrees -> buildTree words subtree ])

    /// Copies the fields of a CoreNLP coreference mention into a `MentionEntity`.
    /// Note that `Relationship` is filled with the name of the mention's animacy value.
    let getMention (mention:edu.stanford.nlp.coref.data.CorefChain.CorefMention) : MentionEntity = 
        {
            Index = mention.mentionID
            Relationship = mention.animacy.name()
            Span = (mention.startIndex, mention.endIndex)
            Head = (mention.headIndex, mention.mentionSpan)
            SentenceIndex = mention.sentNum
        }

    /// Builds one `CrossLinkType` per entry of `keys`, in `keySet()` order.
    /// Every value of the map is cast to a `CorefChain`; the record holds the
    /// chain id, the representative mention and all mentions in textual order.
    let getMentions (keys:java.util.HashMap) = 
        let toCrossLink (key:obj) : CrossLinkType =
            let chain = keys.get(key) :?> edu.stanford.nlp.coref.data.CorefChain
            let representative = chain.getRepresentativeMention() |> getMention
            let chainId = chain.getChainID()
            let members =
                chain.getMentionsInTextualOrder().toArray()
                |> Seq.cast<edu.stanford.nlp.coref.data.CorefChain.CorefMention>
                |> Seq.map getMention
                |> Seq.toList
            { Index = chainId; RepresentiveMention = representative; Mentions = members; }

        keys.keySet().toArray()
        |> Seq.map toCrossLink
        |> Seq.toList
    
    /// Normalises an annotation value into a sequence of 'T:
    ///   * `null` (nothing stored under the requested key) gives an empty sequence,
    ///   * a `java.util.ArrayList` gives its elements,
    ///   * anything else gives a one-element sequence holding the value cast to 'T.
    /// Before this change a `null` was wrapped as a single null element, which only
    /// failed later with a NullReferenceException far from the missing annotation.
    /// Callers using `Seq.exactlyOne` now fail immediately with an ArgumentException,
    /// and callers that iterate simply see no items.
    let returnSeq<'T> (x:obj) : seq<'T> = 
        match x with
        | null -> Seq.empty
        | :? java.util.ArrayList -> toEnumerable<'T> x
        | _ -> Seq.singleton (x :?> 'T)
    
module NLPExtensions = 
    open NLPUtils
    open edu.stanford.nlp.util

    type CoreMap with
        /// Reads the value stored under annotation type `t` and returns it as a
        /// sequence of 'T, using `returnSeq` to normalise list and single values.
        member this.GetToken<'T> (t:System.Type) = 
            let raw = getObjFromMap this t
            returnSeq<'T> raw

The execution result is shown below:

Apollo 13 - Tao Liu's blog

Pages

Sunday, November 11, 2018

Move NLP IKVM to F# .NET-Friendly

No comments: