import _ from 'lodash';
import sortBy from 'lodash/fp/sortBy';
import find from 'lodash/fp/find';
import { overlap, translateOffset } from './offsetUtils';
import Tokenizer from '../../utils/textTokenize'
import { Offset } from "../../application/model";

// Characters treated as word boundaries. The set is copied from the 'tokenize-text' lib, except that the
// dash is NOT listed as a boundary, which allows French compound words to be kept together.
const WORD_BOUNDARY_CHARS = '\t\r\n\u00A0 !\"#$%&()*+,.\\/:;<=>?@\[\\\]^_`{|}~';
// Matches (and captures) one run of consecutive characters that are not in WORD_BOUNDARY_CHARS.
const SPLIT_REGEX = new RegExp('([^' + WORD_BOUNDARY_CHARS + ']+)');
// Upper bound on the number of words kept in front of the first highlighted offset.
const MAX_WORDS_BEFORE_OFFSET = 5;
const tokenizer = new Tokenizer();
// Tokenizing function built from SPLIT_REGEX. Its result is used below as a list whose entries expose an
// `index` property — presumably the token's position inside the tokenized string; confirm against Tokenizer.
const words = tokenizer.re(SPLIT_REGEX);

/**
 * Value returned by `keepHighlightedSentences`.
 */
export interface KeepHighlightedSentencesResult {
  /** The text, possibly with its beginning removed. */
  text: string;
  /** The highlight offsets, shifted so that they are relative to the returned `text`. */
  offsets: Offset[];
  /** `true` when characters were removed from the beginning of the original text. */
  truncatedLeft: boolean;
}

/**
 * Drops the beginning of `text` so that the result starts shortly before the first highlighted offset.
 *
 * The text is cut at the start of the sentence overlapping the first offset (lowest `start`). If more than
 * `MAX_WORDS_BEFORE_OFFSET` words separate that sentence start from the offset, the cut is moved forward so
 * that only the last `MAX_WORDS_BEFORE_OFFSET` words before the offset are kept. Nothing is removed from the
 * end of the text.
 *
 * @param text - The full text containing the highlights.
 * @param offsets - Highlighted ranges within `text`; returned untouched when empty.
 * @param sentences - Sentence ranges within `text`. When empty, or when none of them overlaps the first
 *   offset, the whole text is treated as a single sentence.
 * @returns The (possibly shortened) text, the offsets translated to match it, and whether a cut happened.
 */
export const keepHighlightedSentences = (text: string, offsets: Array<Offset>, sentences: Array<Offset>): KeepHighlightedSentencesResult => {
  if (_.isEmpty(offsets) || _.isEmpty(text)) {
    return {
      text,
      offsets,
      truncatedLeft: false
    };
  }

  const firstOffset = _.sortBy(offsets, 'start')[0];

  // the highlight already sits at the very beginning: there is nothing to cut
  if (firstOffset.start === 0) {
    return {
      text,
      offsets,
      truncatedLeft: false
    };
  }

  // a single pseudo-sentence spanning the whole text, used when no usable sentence is available
  const wholeText: Offset = { start: 0, end: text.length - 1 };

  // work on a local list instead of reassigning the `sentences` parameter
  const candidateSentences: Array<Offset> = _.isEmpty(sentences) ? [wholeText] : sentences;

  // earliest sentence (by start) overlapping the first offset
  const overlapsFirstOffset = (sentence: Offset) => overlap(firstOffset, sentence);
  const matchingSentence: Offset | undefined = find(overlapsFirstOffset, sortBy('start', candidateSentences));

  // no sentence overlaps the offset (e.g. incomplete sentence data): fall back to the whole text
  // rather than failing on an undefined sentence
  const firstSentence: Offset = matchingSentence === undefined ? wholeText : matchingSentence;

  // count number of words between sentence start and offset start
  const tokens = words(text.substring(firstSentence.start, firstOffset.start));

  // and make sure we keep no more than MAX_WORDS_BEFORE_OFFSET words;
  // token indexes are relative to the tokenized substring, hence the added sentence start
  const startIndex = tokens.length <= MAX_WORDS_BEFORE_OFFSET ?
                     firstSentence.start :
                     firstSentence.start + tokens[tokens.length - MAX_WORDS_BEFORE_OFFSET].index;

  return {
    text: text.substring(startIndex, text.length),
    offsets: _.map(offsets, o => translateOffset(o, -startIndex)),
    truncatedLeft: startIndex > 0
  };
};
