import _ from 'lodash';
import { RGX_LINE_BRAKES } from './DataExtraction/regexExtraction';

/**
 * Caps a token list at `maxLen` entries.
 *
 * Keeps the first `maxLen` words and drops the remainder. A list that is
 * already short enough is returned unchanged (same array reference).
 *
 * @param words - Tokens to cap.
 * @param maxLen - Maximum number of tokens to keep; values below 0 act as 0.
 * @returns At most `maxLen` tokens, taken from the start of `words`.
 */
const reduceNumberOfWords = (words: string[], maxLen: number): string[] =>
  // slice(0, maxLen) keeps the head of the list; slice(maxLen) would instead
  // discard the head and return a tail of arbitrary length.
  maxLen < words.length ? words.slice(0, Math.max(0, maxLen)) : words;

/**
 * Right-pads a sequence of token ids until it holds `maxLen` entries.
 *
 * @param sentences - Token ids to pad.
 * @param maxLen - Target length.
 * @param value - Filler appended after the existing ids (defaults to 0).
 * @returns The input array itself when it already holds `maxLen` or more
 *   entries (it is never shortened); otherwise a new array with `value`
 *   appended until its length equals `maxLen`.
 */
const padSentences = (sentences: number[], maxLen: number, value = 0) => {
  const missing = maxLen - sentences.length;
  // Nothing to add: hand back the caller's array untouched.
  if (!(missing > 0)) return sentences;
  // Padding goes after the existing ids ("post" padding).
  const filler = _.fill(Array(missing), value);
  return sentences.concat(filler);
};

/**
 * Encodes a sentence as a sequence of word ids using a vocabulary lookup.
 *
 * Steps: lower-case the text, strip punctuation, digits, slashes and
 * whatever `RGX_LINE_BRAKES` matches (presumably line breaks; defined in
 * `./DataExtraction/regexExtraction`), split on single spaces, drop empty
 * tokens, map every word to its id, then pass the result through
 * `reduceNumberOfWords` and `padSentences` with `maxLen`.
 *
 * @param sentence - Raw text to encode.
 * @param wordIndex - Vocabulary mapping a lower-cased word to its id.
 * @param maxLen - Length limit handed to `reduceNumberOfWords` and
 *   `padSentences`.
 * @returns Word ids; a word missing from `wordIndex` is encoded as 1 and
 *   padding positions are 0.
 */
export const tokenizeWithWordIndex = (
  sentence: string,
  wordIndex: {
    [key: string]: number;
  },
  maxLen: number
): number[] => {
  const words = sentence
    .toLowerCase()
    .replace(/[$&+,:;=?@#|'<>.^*()%_!-]/gi, '')
    .replace(/[0-9]/g, '')
    // Global regex on purpose: a string pattern removes only the first "/".
    .replace(/\//g, '')
    .replace(RGX_LINE_BRAKES, '')
    .split(' ')
    // Consecutive spaces (or an empty sentence) yield empty tokens; without
    // this filter each of them would be encoded as an unknown word.
    .filter((word) => word.length > 0);
  const wordsSliced = reduceNumberOfWords(words, maxLen);
  const sentences = wordsSliced.map((word) => {
    // Own-property check so that words such as "constructor" do not resolve
    // to members inherited from Object.prototype.
    const id = Object.prototype.hasOwnProperty.call(wordIndex, word)
      ? wordIndex[word]
      : undefined;
    // 1 is the id used for out-of-vocabulary words.
    return id === undefined ? 1 : id;
  });
  return padSentences(sentences, maxLen);
};