Source code for filipino_tokenizer.tagalog.segmenter

from filipino_tokenizer.base import BaseSegmenter
from filipino_tokenizer.tagalog.affixes import TagalogAffixes
from filipino_tokenizer.tagalog.roots import TagalogRoots
from filipino_tokenizer.tagalog.phonology import TagalogPhonology


class TagalogSegmenter(BaseSegmenter):
    """
    Multi-pass morphological segmenter for Tagalog.

    Pass order (per SKILL.md):
      0. Frozen-form guard — words whose affix analysis is blocked by
         identical-definition duplicates in the dict.
      1. Circumfix detection — ka- -han, pag- -an, etc.
      2. Prefix stripping   — longest-match-first, recursive for stacked
         prefixes
      3. Infix detection    — -um- and -in- after the initial consonant
         (the digraph 'ng' counts as a single consonant)
      4. Suffix stripping   — -an/-han, -in/-hin phonology variants
      5. Fallback           — return [word] unsegmented

    Root validation: every candidate root is checked against the root
    dictionary before a segmentation is accepted.

    Redundancy check: if both the whole word and the stripped root appear
    in the dictionary with identical definitions the analysis is rejected.
    This catches frozen forms like 'pangalan' where 'alan' and 'pangalan'
    share the same definition ("name; reputation; repute; denomination").
    The check is applied by every pass that accepts a bare root.

    _MIN_ROOT = 4: roots shorter than 4 characters are rejected to avoid
    spurious matches against short dictionary fragments (e.g. 'gka', 'nda')
    that appear as roots only because the dictionary stores inflected forms
    under truncated keys.
    """

    VOWELS = frozenset('aeiou')
    _MIN_ROOT = 4  # minimum characters a valid root must have

    def __init__(self):
        """Create the affix inventory, root dictionary and phonology helpers."""
        self.affixes = TagalogAffixes()
        self.roots = TagalogRoots()
        self.phonology = TagalogPhonology()

    # ------------------------------------------------------------------ #
    # Public interface                                                     #
    # ------------------------------------------------------------------ #

    def segment(self, word: str) -> list[str]:
        """
        Split *word* into morphemes.

        The word is lower-cased and stripped of surrounding whitespace
        first. An empty result of that normalisation yields ``[]``.
        Otherwise the passes run in order (circumfix, prefix, infix,
        suffix) and the first one that produces an analysis wins; if none
        does, the normalised word is returned as a single-element list.
        """
        word = word.lower().strip()
        if not word:
            return []

        # Guard: frozen/lexicalized forms are returned unsegmented
        if self._is_frozen(word):
            return [word]

        # Each pass returns None on failure, so ``or`` falls through to
        # the next pass and finally to the unsegmented fallback.
        return (
            self._try_circumfix(word)
            or self._try_prefix(word)
            or self._try_infix(word)
            or self._try_suffix(word)
            or [word]
        )

    # ------------------------------------------------------------------ #
    # Pass 0 — frozen-form guard                                           #
    # ------------------------------------------------------------------ #

    def _is_frozen(self, word: str) -> bool:
        """
        Return True if the word is a frozen/lexicalized form that should
        not be morphologically decomposed.

        A word is frozen when it is itself in the roots dictionary AND at
        least one prefix-stripping yields a root whose dictionary
        definition is identical to the whole word's definition. That
        identity signals a duplicate/alternate-form entry rather than
        productive affixation.

        Example: 'pangalan' (name) → strip 'pang-' → 'alan' (name).
        Both share the same definition, so the analysis is frozen.
        """
        if not self.roots.is_root(word):
            return False

        for prefix in self.affixes.get_prefixes():
            # The part left after the prefix must be at least _MIN_ROOT long.
            if len(word) - len(prefix) < self._MIN_ROOT:
                continue
            if not word.startswith(prefix):
                continue
            remainder = word[len(prefix):]
            if (self.roots.is_root(remainder)
                    and self._is_redundant(word, remainder)):
                return True
        return False

    # ------------------------------------------------------------------ #
    # Pass 1 — circumfix                                                   #
    # ------------------------------------------------------------------ #

    def _try_circumfix(self, word: str) -> list[str] | None:
        """
        Try to analyse *word* as prefix + root + suffix.

        Returns ``[prefix, core, suffix]`` for the first circumfix pair
        whose core is a dictionary root of at least _MIN_ROOT characters
        and is not redundant with the whole word; otherwise ``None``.
        """
        for prefix, suffix in self.affixes.get_circumfixes():
            p, s = len(prefix), len(suffix)
            # Nothing would be left between the two affixes.
            if len(word) <= p + s:
                continue
            if word[:p] != prefix or word[-s:] != suffix:
                continue
            core = word[p:-s]
            if (len(core) >= self._MIN_ROOT
                    and self.roots.is_root(core)
                    and not self._is_redundant(word, core)):
                return [prefix, core, suffix]
        return None

    # ------------------------------------------------------------------ #
    # Pass 2 — prefix (recursive for stacked)                              #
    # ------------------------------------------------------------------ #

    def _try_prefix(self, word: str, depth: int = 0) -> list[str] | None:
        """
        Try to strip one or more stacked prefixes from *word*.

        For each matching prefix the remainder is first analysed further
        (another prefix, then an infix) and only then accepted as a bare
        root. Recursion stops once *depth* exceeds 3. Returns the list of
        morphemes starting with the stripped prefix, or ``None``.
        """
        if depth > 3:
            return None

        for prefix in self.affixes.get_prefixes():
            p = len(prefix)
            # Remainder must be at least _MIN_ROOT chars
            if len(word) - p < self._MIN_ROOT:
                continue
            if not word.startswith(prefix):
                continue
            remainder = word[p:]

            # Try deeper segmentation of remainder before accepting bare root
            sub = self._try_prefix(remainder, depth + 1)
            if sub:
                return [prefix] + sub

            sub = self._try_infix(remainder)
            if sub:
                return [prefix] + sub

            # Accept remainder as a bare root
            if (self.roots.is_root(remainder)
                    and not self._is_redundant(word, remainder)):
                return [prefix, remainder]
        return None

    # ------------------------------------------------------------------ #
    # Pass 3 — infix                                                       #
    # ------------------------------------------------------------------ #

    def _try_infix(self, word: str) -> list[str] | None:
        """
        Try to analyse *word* as initial consonant + infix + rest.

        Words shorter than 3 characters or starting with a vowel are
        rejected. The initial consonant is normally the first character;
        for words starting with 'ng' the digraph is additionally tried as
        the onset (e.g. 'ngumiti' → 'um' + 'ngiti'), after the
        single-character analysis. Returns ``[infix, root]`` where the
        root is the onset re-joined with the text after the infix, is at
        least _MIN_ROOT characters long, is in the root dictionary, and is
        not redundant with *word*; otherwise ``None``.
        """
        # Infixes attach after the initial consonant only
        if len(word) < 3 or word[0] in self.VOWELS:
            return None

        # Single-consonant onset first (the common case), then the 'ng'
        # digraph, which Tagalog orthography treats as one consonant.
        onsets = [word[0]]
        if word.startswith('ng'):
            onsets.append('ng')

        # Materialise once so the inventory can be scanned per onset even
        # if get_infixes() hands back a one-shot iterator.
        infixes = tuple(self.affixes.get_infixes())

        for onset in onsets:
            k = len(onset)
            for infix in infixes:
                if not word.startswith(infix, k):
                    continue
                root = onset + word[k + len(infix):]
                # Same acceptance rule as the other passes: long enough,
                # a known root, and not a duplicate entry of the word.
                if (len(root) >= self._MIN_ROOT
                        and self.roots.is_root(root)
                        and not self._is_redundant(word, root)):
                    return [infix, root]
        return None

    # ------------------------------------------------------------------ #
    # Pass 4 — suffix                                                      #
    # ------------------------------------------------------------------ #

    def _try_suffix(self, word: str) -> list[str] | None:
        """
        Try to strip a suffix from *word*.

        For each suffix, the phonology helper proposes candidate roots;
        the first candidate that is at least _MIN_ROOT characters long, is
        a dictionary root, and is not redundant with *word* is accepted.
        Returns ``[root, surface_suffix]`` where the surface suffix comes
        from ``apply_suffix_phonology``; otherwise ``None``.
        """
        for suffix in self.affixes.get_suffixes():
            for root_cand in self.phonology.strip_suffix(word, suffix):
                if (len(root_cand) >= self._MIN_ROOT
                        and self.roots.is_root(root_cand)
                        and not self._is_redundant(word, root_cand)):
                    surface_suf = self.phonology.apply_suffix_phonology(
                        root_cand, suffix
                    )
                    return [root_cand, surface_suf]
        return None

    # ------------------------------------------------------------------ #
    # Helpers                                                              #
    # ------------------------------------------------------------------ #

    def _is_redundant(self, word: str, root_candidate: str) -> bool:
        """
        Return True when the whole word and the candidate root appear in
        the roots dictionary with identical definitions.

        Definitions are compared after stripping surrounding whitespace.
        If either lookup yields a falsy result the pair is not redundant.
        """
        info_w = self.roots.get_root_info(word)
        info_r = self.roots.get_root_info(root_candidate)
        if info_w and info_r:
            return info_w['definition'].strip() == info_r['definition'].strip()
        return False