// Matches, in order of appearance, either a tag-like run (`<` ... `>`) or a run of characters
// that contains no angle brackets. Global, so `match`/`replace` visit every piece.
const HTML_SPLIT_REGEX = /<[^>]*>|[^<>]+/g;
// Matches a string made of exactly one alphabetic (`\p{Alpha}`) or numeric (`\p{N}`) code point.
// Inside a character class `|` is a literal pipe rather than alternation, so it must not appear
// here; otherwise "|" itself would be accepted as a letter/digit.
const UNICODE_ALPHANUMERIC_REGEX = /^[\p{Alpha}\p{N}]$/u;
// Private Use Area character substituted for "\n" before the text is handed to the segmenters.
// It has the same length as "\n", so string offsets are unaffected by the substitution.
const NEWLINE_PLACEHOLDER = '\uE000';

/**
 * Generates text tokens from the given HTML element based on the specified locale.
 * The content is split into sentences and then into words, so a truncate point can be found.
 *
 * Tags are masked with spaces before segmentation so the segmenters only see "real" words, but
 * every token's text is sliced out of the original markup. Masked runs (tags and whitespace)
 * that surround a sentence are therefore emitted as their own tokens: WORD when the original
 * slice contains non-whitespace (i.e. markup), WHITESPACE otherwise.
 *
 * @param {Node | HTMLElement} node - The HTML element to be truncated.
 * @param {String} [locale='en'] - The locale to segment the node with, in BCP 47 language tag format
 * @returns {TextToken[]} An array of parsed text tokens; empty when the node is nullish or has
 *   no (non-whitespace) content
 */
export function generateTextTokens(node, locale = 'en') {
  // Prefer `outerHTML`; nodes without it (e.g. text nodes) fall back to `nodeValue`.
  // A nullish node or a null `nodeValue` is treated as empty content instead of throwing.
  const html = (node?.outerHTML ?? node?.nodeValue ?? '').trim();
  if (html.length === 0) {
    return [];
  }

  const sentenceSegmenter = new Intl.Segmenter(locale, { granularity: 'sentence' });
  const wordSegmenter = new Intl.Segmenter(locale, { granularity: 'word' });

  // Mask all HTML tags so the segmenters process only "real" words. `replace` (unlike
  // `match` + `join`) leaves characters the regex does not match, such as a stray "<" or ">",
  // in place, so `sanitizedInput` always has exactly the same length as `html` and every
  // offset computed below is valid for both strings.
  const sanitizedInput = html
    .replace(HTML_SPLIT_REGEX, (part) => (part.startsWith('<') ? ' '.repeat(part.length) : part))
    .replace(/\n/g, NEWLINE_PLACEHOLDER);

  const res = [];

  // Emits the original text hidden behind a masked/whitespace run as a single token.
  const pushGapToken = (start, length) => {
    const gapText = html.substring(start, start + length);

    res.push(new TextToken(
      gapText,
      gapText.trim().length > 0 ? TextTokenType.WORD : TextTokenType.WHITESPACE,
      false,
    ));
  };

  // Offset (in `html`) at which the current sentence segment begins.
  let segmentStart = 0;
  for (const { segment } of sentenceSegmenter.segment(sanitizedInput)) {
    const withoutLeading = segment.trimStart();
    const leadingLength = segment.length - withoutLeading.length;
    const sentence = withoutLeading.trimEnd();
    const trailingLength = withoutLeading.length - sentence.length;

    if (leadingLength > 0) {
      // The gap starts where this segment starts, not where the previous sentence started.
      pushGapToken(segmentStart, leadingLength);
    }

    const sentenceStart = segmentStart + leadingLength;
    const sentenceEnd = sentenceStart + sentence.length;
    const originalSentence = html.substring(sentenceStart, sentenceEnd);
    const wordSegments = Array.from(wordSegmenter.segment(sentence));

    let wordStart = 0;
    for (const [wordIndex, wordSegment] of wordSegments.entries()) {
      const wordEnd = wordStart + wordSegment.segment.length;
      // Slice from the original text so newline placeholders are turned back into "\n".
      const originalWord = originalSentence.substring(wordStart, wordEnd);

      res.push(new TextToken(
        originalWord,
        getTextTokenType(originalWord),
        // Only the last word segment of a sentence longer than one character ends a sentence.
        sentence.length > 1 && wordIndex === wordSegments.length - 1,
      ));

      wordStart = wordEnd;
    }

    if (trailingLength > 0) {
      pushGapToken(sentenceEnd, trailingLength);
    }

    segmentStart += segment.length;
  }

  return res;
}

/**
 * Classifies a piece of token text.
 *
 * @param {string} text - The actual text of the token
 * @returns {number} One of the TextTokenType values: WHITESPACE when the text trims to an empty
 *   string, PUNCTUATION when it is a single UTF-16 unit that isLetterOrDigit rejects, and WORD
 *   in every other case
 */
function getTextTokenType(text) {
  const isBlank = text.trim().length === 0;
  if (isBlank) {
    return TextTokenType.WHITESPACE;
  }

  const isSingleSymbol = text.length === 1 && !isLetterOrDigit(text);
  return isSingleSymbol ? TextTokenType.PUNCTUATION : TextTokenType.WORD;
}

/**
 * Checks the given string against UNICODE_ALPHANUMERIC_REGEX.
 *
 * @param {string} char - The string to check; callers in this file pass a one-character string
 * @returns {boolean} true when the whole string matches the pattern, false otherwise
 */
function isLetterOrDigit(char) {
  const match = UNICODE_ALPHANUMERIC_REGEX.exec(char);
  return match !== null;
}

/**
 * A single piece of tokenized text together with its classification.
 *
 * @class
 * @property {string} text - The actual text of the token.
 * @property {TextTokenType} type - The type of the token.
 * @property {boolean} isSentenceEnd - Whether this token is at the end of a sentence.
 */
class TextToken {
  /**
   * @param {string} text - The actual text of the token.
   * @param {TextTokenType} type - The type of the token.
   * @param {boolean} isSentenceEnd - Whether this token is at the end of a sentence.
   */
  constructor(text, type, isSentenceEnd) {
    // Stores the three arguments as own properties, in declaration order.
    Object.assign(this, { text, type, isSentenceEnd });
  }
}

/**
 * Frozen lookup of the numeric token categories.
 *
 * @readonly
 * @enum {number}
 * @property {number} WORD - 0
 * @property {number} PUNCTUATION - 1
 * @property {number} WHITESPACE - 2
 */
export const TextTokenType = Object.freeze({ WORD: 0, PUNCTUATION: 1, WHITESPACE: 2 });

export default generateTextTokens;
