pipeline.js - Documentation

import difference from 'lodash.difference';
import Service from './service';
import ConnectorServer from './connector/connector-server';
import tokenize from './simple/annotator/tokenize';
import ssplit from './simple/annotator/ssplit';
import pos from './simple/annotator/pos';
import lemma from './simple/annotator/lemma';
import ner from './simple/annotator/ner';
import parse from './simple/annotator/parse';
import depparse from './simple/annotator/depparse';
import relation from './simple/annotator/relation';
import regexner from './simple/annotator/regexner';
import coref from './simple/annotator/coref';
import Document from './simple/document';

import {
  TokensRegexAnnotator,
  SemgrexAnnotator,
  TregexAnnotator,
} from './simple/expression';

/**
 * Lookup table from an annotator key, as written in the pipeline's
 * comma-separated `annotators` property, to the annotator imported above
 * under the same name.
 * Within this file its keys are also used as the list of known annotator
 * names when selecting annotator options from the pipeline properties.
 */
const ANNOTATORS_BY_KEY = {
  tokenize,
  ssplit,
  pos,
  lemma,
  ner,
  parse,
  depparse,
  relation,
  regexner,
  coref,
};

/**
 * Maps the CamelCase language name given to the Pipeline constructor to the
 * two-letter language code handed to `setLanguageISO`.
 * Languages not listed here (including the constructor default
 * 'Unspecified') resolve to `undefined`.
 */
const LANGUAGE_TO_ISO2 = {
  English: 'en',
  French: 'fr',
  German: 'de',
  Spanish: 'es',
};

/**
 * @class
 * @classdesc Class representing a Pipeline.  A pipeline bundles the properties
 * (including the `annotators` list), the language and the connector, and uses
 * a Service built on them to annotate documents, sentences and expressions.
 */
class Pipeline {
  /**
   * Create a Pipeline
   * @param {Properties} properties - must expose `getProperty(name, default)`
   *   and `getProperties()`
   * @param {string} [language] - in CamelCase (i.e. English, Spanish)
   * @param {ConnectorServer|ConnectorCli} [connector] - when omitted, a
   *   `ConnectorServer` with default settings is created
   */
  constructor(properties, language = 'Unspecified', connector = null) {
    this._properties = properties;
    this._language = language;
    this._connector = connector || new ConnectorServer({});
    this._service = new Service(this._connector, this._language);
  }

  /**
   * Retrieves the current Service used by the pipeline
   * @returns {Service} service
   */
  getService() {
    return this._service;
  }

  /**
   * Execute the pipeline against the annotable object, adding annotations to it.
   * Calls the service and loads the associated response metadata into the Annotable model
   * @async
   * @param {Annotable} annotable - the document or sentence to be annotated
   * @returns {Promise<Annotable>} annotated document / sentence
   */
  async annotate(annotable) {
    annotable.fromJSON(await this._service.getAnnotationData(
      annotable.text(),
      this._getAnnotatorsKeys(),
      this._getAnnotatrosOptions()));

    annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]);
    annotable.addAnnotators(this._getAnnotators());

    return annotable;
  }

  /**
   * Verifies that every required annotator is part of this pipeline's
   * `annotators` list.
   * @param {string} [methodName] - name of the feature being checked, used in the error message
   * @param {Array.<Annotator>} [requiredAnnotators] - annotator constructors;
   *   each one is instantiated and its `toString()` is taken as the annotator key
   * @throws {Error} when at least one required annotator is missing
   */
  assert(methodName = '', requiredAnnotators = []) {
    const requiredKeys = requiredAnnotators
      .map(Annotator => (new Annotator()).toString());
    if (difference(requiredKeys, this._getAnnotatorsKeys()).length > 0) {
      // Report the annotator keys; joining the constructors themselves would
      // print their source code instead of their names.
      throw new Error(`Assert: ${methodName} requires ${requiredKeys.join()} within the annotators list.`);
    }
  }

  /**
   * Annotates the given Expression instance with matching groups and/or Tokens
   * @async
   * @param {Expression} annotable - An annotable expression containing a TokensRegex pattern
   * @param {boolean} [annotateExpression] - Whether to hydrate the annotations with tokens or not.
   * IMPORTANT: The optional parameter `annotateExpression` if true, will run the CoreNLP pipeline
   *            twice.  First for the TokensRegex annotation, and one more for the standard pipeline
   *            Token annotations (pos, ner, lemma, etc).
   * @returns {Promise<Expression>} expression - The current expression instance
   * @throws {Error} when `tokenize` or `ssplit` is missing from the annotators list
   */
  async annotateTokensRegex(annotable, annotateExpression = false) {
    this.assert('TokensRegex', [tokenize, ssplit]);
    annotable.fromJSON(await this._service.getTokensRegexData(
      annotable.text(),
      annotable.pattern(),
      this._getAnnotatorsKeys(),
      this._getAnnotatrosOptions()));

    annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]);
    annotable.addAnnotator(TokensRegexAnnotator);

    if (annotateExpression) {
      return this._annotateExpression(annotable);
    }
    return annotable;
  }

  /**
   * Annotates the given Expression instance with matching groups and/or Tokens
   * @async
   * @param {Expression} annotable - An annotable expression containing a Semgrex pattern
   * @param {boolean} [annotateExpression] - Whether to hydrate the annotations with tokens or not.
   * IMPORTANT: The optional parameter `annotateExpression` if true, will run the CoreNLP pipeline
   *            twice.  First for the Semgrex annotation, and one more for the standard pipeline
   *            Token annotations (pos, ner, lemma, etc).
   * @returns {Promise<Expression>} expression - The current expression instance
   * @throws {Error} when `tokenize`, `ssplit` or `depparse` is missing from the annotators list
   */
  async annotateSemgrex(annotable, annotateExpression = false) {
    this.assert('Semgrex', [tokenize, ssplit, depparse]);
    annotable.fromJSON(await this._service.getSemgrexData(
      annotable.text(),
      annotable.pattern(),
      this._getAnnotatorsKeys(),
      this._getAnnotatrosOptions()));

    annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]);
    annotable.addAnnotator(SemgrexAnnotator);

    if (annotateExpression) {
      return this._annotateExpression(annotable);
    }
    return annotable;
  }

  /**
   * Annotates the given Expression instance with matching groups and/or Tokens
   * @async
   * @param {Expression} annotable - An annotable expression containing a Tregex pattern
   * @param {boolean} [annotateExpression] - Whether to hydrate the annotations with tokens or not.
   * IMPORTANT: The optional parameter `annotateExpression` if true, will run the CoreNLP pipeline
   *            twice.  First for the Tregex annotation, and one more for the standard pipeline
   *            Token annotations (pos, ner, lemma, etc).
   * @returns {Promise<Expression>} expression - The current expression instance
   * @throws {Error} when `tokenize`, `ssplit` or `parse` is missing from the annotators list
   */
  async annotateTregex(annotable, annotateExpression = false) {
    this.assert('Tregex', [tokenize, ssplit, parse]);
    annotable.fromJSON(await this._service.getTregexData(
      annotable.text(),
      annotable.pattern(),
      this._getAnnotatorsKeys(),
      this._getAnnotatrosOptions()));

    annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]);
    annotable.addAnnotator(TregexAnnotator);

    if (annotateExpression) {
      return this._annotateExpression(annotable);
    }
    return annotable;
  }


  /**
   * @private
   * @description
   * Runs the default pipeline over the same text of the expression, and merges the results
   * @param {Expression} annotableExpression - expression already annotated by one of the
   *   `annotate*regex` methods
   * @returns {Promise<Expression>} the same expression instance, after merging the document tokens
   */
  async _annotateExpression(annotableExpression) {
    const doc = await this.annotate(new Document(annotableExpression.text()));
    doc.setLanguageISO(LANGUAGE_TO_ISO2[this._language]);
    annotableExpression.mergeTokensFromDocument(doc);
    return annotableExpression;
  }

  /**
   * Requests the Semgrex data for the given text and pattern and returns the
   * service response as-is, without loading it into any Annotable.
   * @private
   * @param {string} text
   * @param {string} pattern
   * @returns {Promise<Object>} data - whatever `Service#getSemgrexData` resolves with
   */
  async _semgrex(text, pattern) {
    const data = await this._service.getSemgrexData(
      text,
      pattern,
      this._getAnnotatorsKeys(),
      this._getAnnotatrosOptions());

    return data;
  }

  /**
   * Reads the comma-separated `annotators` property and returns its trimmed,
   * non-empty entries.
   * @private
   * @returns {Array.<string>} annotators - those set for this pipeline
   */
  _getAnnotatorsKeys() {
    return this._properties.getProperty('annotators', '')
      .split(',')
      .map(annotatorKey => annotatorKey.trim())
      // ''.split(',') yields [''], so drop blanks to get [] for an empty list.
      .filter(annotatorKey => annotatorKey.length > 0);
  }

  /**
   * Resolves the pipeline's annotator keys to the annotators registered in
   * ANNOTATORS_BY_KEY.  Keys without a registered annotator are skipped so
   * the result never contains `undefined` entries.
   * @private
   * @returns {Array.<Annotator>} annotators - those set for this pipeline
   */
  _getAnnotators() {
    return this._getAnnotatorsKeys()
      .filter(annotatorKey =>
        Object.prototype.hasOwnProperty.call(ANNOTATORS_BY_KEY, annotatorKey))
      .map(annotatorKey => ANNOTATORS_BY_KEY[annotatorKey]);
  }

  /**
   * Selects, from the pipeline properties, the options that belong to a known
   * annotator: properties named exactly like an ANNOTATORS_BY_KEY key or
   * prefixed with `<key>.` (i.e. `ner.model`).
   * @private
   * @returns {Object.<string, *>} options - property name to property value
   */
  _getAnnotatrosOptions() {
    const pipelineProps = this._properties.getProperties();
    const validPrefixes = Object.keys(ANNOTATORS_BY_KEY);
    return Object.keys(pipelineProps)
      .filter(propName => validPrefixes.some(prefix =>
        propName === prefix || propName.startsWith(`${prefix}.`)))
      .reduce((acc, propName) => ({ ...acc, [propName]: pipelineProps[propName] }), {});
  }
}


export default Pipeline;