import { JSONContent } from '@tiptap/core';
import { EMPTY } from './markEmptyNodes';
import { Token } from './types';

/**
 * Create the path entry that represents a node inside a token's path.
 *
 * The result is a shallow copy of the node with exactly one key left out:
 *
 * - the EMPTY flag, when the node carries a truthy EMPTY marker;
 * - otherwise "text", when the node's type is "text";
 * - otherwise "content".
 *
 * Property values are copied by reference, not cloned.
 *
 * @param {JSONContent} node - The node to derive the path entry from
 * @returns {JSONContent} A new object holding every enumerable key of the node except the omitted one
 */
function buildPathElement(node: JSONContent): JSONContent {
  // The EMPTY marker wins over the node type when deciding which key to drop.
  const omittedKey = node[EMPTY]
    ? EMPTY
    : node.type === 'text'
      ? 'text'
      : 'content';

  const pathElement: JSONContent = {};
  for (const key in node) {
    if (key === omittedKey) {
      continue;
    }
    pathElement[key] = node[key];
  }
  return pathElement;
}

/**
 * Split text into consecutive runs of word characters, whitespace and punctuation.
 *
 * The runs are returned in source order and, joined together, reproduce the input exactly.
 * A word is a run of Unicode letters, digits, combining marks or underscores, so
 * non-ASCII words such as "café" stay in one piece (ASCII-only `\w` would treat the
 * accented letter as punctuation and cut the word in two).
 *
 * @param {string} text - The text to split
 * @returns {string[]} The runs in order; an empty array when the text is empty
 */
const splitIntoWords = (text: string): string[] => {
  // Three mutually exclusive classes: word run, whitespace run, everything else.
  // The `u` flag enables \p{...} property escapes and code-point-aware matching.
  const wordPattern = /[\p{L}\p{N}\p{M}_]+|\s+|[^\p{L}\p{N}\p{M}_\s]+/gu;

  // match() yields null (not an empty array) when nothing matches, e.g. for "".
  return text.match(wordPattern) ?? [];
};

/**
 * Flatten a ProseMirror JSON document into a list of tokens.
 *
 * The tree is walked depth-first, parents before children. For every visited node:
 *
 * - if the node carries a truthy EMPTY marker, a single token whose character is EMPTY is emitted;
 * - otherwise, if it is a text node with defined text, one token is emitted per piece of text:
 *   per piece returned by splitIntoWords when splitBy is "word", or per character
 *   (as produced by iterating the string) when splitBy is "char";
 * - in all cases the node's children, if any, are visited afterwards.
 *
 * Every token carries a "path": the chain of path elements (see buildPathElement) from the
 * root down to the node that produced the token. Each token gets its own shallow copies of
 * those elements.
 *
 * @param {JSONContent} doc - The ProseMirror document to tokenize
 * @param {'word' | 'char'} splitBy - Granularity used to split the text of text nodes
 * @returns {Token[]} The tokens in document order
 * @throws {Error} If splitBy is neither "word" nor "char" at the time a text node is split
 */
function tokenize(doc: JSONContent, splitBy: 'word' | 'char'): Token[] {
  const collected: Token[] = [];

  const visit = (current: JSONContent, ancestors: JSONContent[]): void => {
    const trail = [...ancestors, buildPathElement(current)];
    // Fresh shallow copies per token, so tokens never share path element objects.
    const snapshot = () => trail.map((step) => ({ ...step }));

    if (current[EMPTY]) {
      collected.push({ character: EMPTY, path: snapshot() });
    } else if (current.type === 'text' && current.text !== undefined) {
      let pieces: Iterable<string>;
      switch (splitBy) {
        case 'word':
          pieces = splitIntoWords(current.text);
          break;
        case 'char':
          // Iterating the string itself yields one entry per character.
          pieces = current.text;
          break;
        default:
          throw new Error(
            `Invalid splitBy option: ${splitBy}. Must be 'word' or 'char'.`
          );
      }
      for (const piece of pieces) {
        collected.push({ character: piece, path: snapshot() });
      }
    }

    const children = current.content;
    if (children && Array.isArray(children)) {
      for (const child of children) {
        visit(child, trail);
      }
    }
  };

  visit(doc, []);
  return collected;
}

export { tokenize };
