simple/token.js

import Service from '../service';
import Annotable from './annotable';

// @see {@link https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/simple/Token.java}

/**
 * The CoreNLP API JSON structure representing a token
 * @typedef TokenJSON
 * @property {number} index
 * @property {string} word
 * @property {string} originalText
 * @property {number} characterOffsetBegin
 * @property {number} characterOffsetEnd
 * @property {string} before
 * @property {string} after
 */

/**
 * @typedef PosInfo
 * @description
 * PosInfo is not provided by CoreNLP itself.  It is an indexed reference of POS tags,
 * organized by language, that ships with this library.  It is only intended for analysis
 * and study.  The data was collected from different documentation resources on the Web.
 * The PosInfo may vary depending on the POS annotation scheme in use: for example, CoreNLP
 * for Spanish uses custom POS tags developed by Stanford, but this can also be changed
 * to Universal Dependencies, which uses different tags.
 * @property {string} group
 * @property {string} tag
 * @property {Array.<string>} examples
 */

/**
 * @class
 * @classdesc Class representing a Token
 * @extends Annotable
 * @memberof CoreNLP/simple
 */
class Token extends Annotable {
  /**
   * Build a Token from its surface form
   * @param {string} word - forwarded unchanged to the Annotable constructor
   */
  // eslint-disable-next-line no-useless-constructor
  constructor(word) {
    super(word);
  }

  /**
   * String form of the token (same value as {@link Token#word})
   * @returns {string} token
   */
  toString() {
    return this._text;
  }

  /**
   * Get the `index` number assigned by StanfordCoreNLP
   * This index is relative to the sentence the token belongs to, and is 1-based
   * (a positive integer).  It is useful to match tokens within a sentence for
   * depparse, coreference, etc.
   * @returns {number} index
   */
  index() {
    return this._index;
  }

  /**
   * Get the word held by this token
   * @returns {string} word
   */
  word() {
    return this._text;
  }

  /**
   * Get the original text of this token
   * @returns {string} originalText
   */
  originalText() {
    return this._originalText;
  }

  /**
   * Get the characterOffsetBegin relative to the parent sentence
   * @description
   * A 0-based index of the word's initial character within the sentence
   * @returns {number} characterOffsetBegin
   */
  characterOffsetBegin() {
    return this._characterOffsetBegin;
  }

  /**
   * Get the characterOffsetEnd relative to the parent sentence
   * @description
   * A 0-based index of the word's ending character within the sentence
   * @returns {number} characterOffsetEnd
   */
  characterOffsetEnd() {
    return this._characterOffsetEnd;
  }

  /**
   * Get the `before` string relative to the container sentence
   * @returns {string} before
   */
  before() {
    return this._before;
  }

  /**
   * Get the `after` string relative to the container sentence
   * @returns {string} after
   */
  after() {
    return this._after;
  }

  /**
   * Get the lemma annotation of this token
   * @returns {string} lemma
   */
  lemma() {
    return this._lemma;
  }

  /**
   * Get the part-of-speech annotation of this token
   * @returns {string} pos
   */
  pos() {
    return this._pos;
  }

  /**
   * Look up additional metadata about the POS annotation
   * NOTE: Do not use this method other than just for study or analysis purposes.
   * @description
   * Delegates to `Service.getTokenPosInfo`, passing the token's POS tag and the
   * language ISO reported by `getLanguageISO()`.
   * @see {@link PosInfo} for more details
   * @returns {PosInfo} posInfo
   */
  posInfo() {
    const tag = this._pos;
    const languageISO = this.getLanguageISO();
    return Service.getTokenPosInfo(tag, languageISO);
  }

  /**
   * Get the named-entity annotation of this token
   * @returns {string} ner
   */
  ner() {
    return this._ner;
  }

  /**
   * Get the speaker annotation of this token
   * @see {@link CorefAnnotator}
   * @returns {string} speaker
   */
  speaker() {
    return this._speaker;
  }

  /**
   * Get a JSON representation of the current token
   * @description
   * The arrow function `data => Token.fromJSON(data).toJSON()` is idempotent when
   * compared shallowly (by value, not by reference).
   * The returned object uses the same structure that {@link Token.fromJSON} reads.
   * @returns {TokenJSON} data
   */
  toJSON() {
    const {
      _index: index,
      _text: word,
      _originalText: originalText,
      _characterOffsetBegin: characterOffsetBegin,
      _characterOffsetEnd: characterOffsetEnd,
      _before: before,
      _after: after,
      // annotations metadata
      _pos: pos,
      _lemma: lemma,
      _ner: ner,
      _speaker: speaker,
    } = this;

    return {
      index,
      word,
      originalText,
      characterOffsetBegin,
      characterOffsetEnd,
      before,
      after,
      pos,
      lemma,
      ner,
      speaker,
    };
  }

  /**
   * Create a Token instance from a given JSON
   * @description
   * Instantiates through `new this()`, so calling it on a subclass yields an
   * instance of that subclass.
   * @param {TokenJSON} data - The token data, as returned by CoreNLP API service
   * @returns {Token} token - A new Token instance
   */
  static fromJSON(data) {
    const token = new this();
    const {
      index,
      word,
      originalText,
      characterOffsetBegin,
      characterOffsetEnd,
      before,
      after,
      // annotations metadata
      pos,
      lemma,
      ner,
      speaker,
    } = data;

    // Object.assign hands back its target, i.e. the populated token.
    return Object.assign(token, {
      _index: index,
      _text: word,
      _originalText: originalText,
      _characterOffsetBegin: characterOffsetBegin,
      _characterOffsetEnd: characterOffsetEnd,
      _before: before,
      _after: after,
      _pos: pos,
      _lemma: lemma,
      _ner: ner,
      _speaker: speaker,
    });
  }
}

export default Token;