# Source code for filipino_tokenizer.tagalog.segmenter
from filipino_tokenizer.base import BaseSegmenter
from filipino_tokenizer.tagalog.affixes import TagalogAffixes
from filipino_tokenizer.tagalog.roots import TagalogRoots
from filipino_tokenizer.tagalog.phonology import TagalogPhonology
# [docs]
class TagalogSegmenter(BaseSegmenter):
    """
    Rule-based morphological segmenter for Tagalog words.

    ``segment`` normalises the input and then runs a fixed series of
    passes, returning the first analysis that succeeds (pass order per
    SKILL.md):

    0. Frozen-form guard -- a word that is itself a dictionary root and
       has a prefix-stripped remainder with the very same definition is
       returned whole.
    1. Circumfix detection -- prefix/suffix pairs such as ka- -han or
       pag- -an.
    2. Prefix stripping -- recursive, so stacked prefixes are peeled one
       at a time.  Prefixes are tried in the order ``get_prefixes()``
       supplies them (intended to be longest-first; that ordering is not
       enforced in this class).
    3. Infix detection -- an infix such as -um- or -in- sitting right
       after the first consonant.
    4. Suffix stripping -- root candidates and the surface suffix come
       from the phonology helper (-an/-han, -in/-hin variants).
    5. Fallback -- the word is returned unsegmented as ``[word]``.

    Root validation: a candidate root is only accepted when the root
    dictionary recognises it.

    Redundancy check: the circumfix, prefix and suffix passes reject an
    analysis when the whole word and the candidate root are both in the
    dictionary with identical definitions.  This is what keeps frozen
    forms such as 'pangalan' intact, where 'alan' and 'pangalan' share
    one definition ("name; reputation; repute; denomination").

    ``_MIN_ROOT``: candidate roots shorter than this many characters are
    discarded, to avoid spurious hits on short dictionary fragments
    (e.g. 'gka', 'nda') that exist only because the dictionary stores
    inflected forms under truncated keys.
    """

    VOWELS = frozenset('aeiou')
    _MIN_ROOT = 4  # minimum characters a valid root must have

    def __init__(self):
        """Create the affix inventory, root dictionary and phonology helper."""
        self.affixes = TagalogAffixes()
        self.roots = TagalogRoots()
        self.phonology = TagalogPhonology()

    # ------------------------------------------------------------------ #
    # Public interface                                                   #
    # ------------------------------------------------------------------ #
    def segment(self, word: str) -> list:
        """
        Split ``word`` into morpheme strings.

        The word is lower-cased and stripped of surrounding whitespace
        first.  Returns ``[]`` for an empty word, ``[word]`` when the
        word is frozen or no pass finds an analysis, and otherwise the
        result of the first pass that succeeds.
        """
        word = word.lower().strip()
        if not word:
            return []

        # Frozen/lexicalized forms are never decomposed.
        if self._is_frozen(word):
            return [word]

        passes = (
            self._try_circumfix,
            self._try_prefix,
            self._try_infix,
            self._try_suffix,
        )
        for run_pass in passes:
            pieces = run_pass(word)
            if pieces:
                return pieces

        # Nothing matched: hand the word back untouched.
        return [word]

    # ------------------------------------------------------------------ #
    # Shared prefix matching                                             #
    # ------------------------------------------------------------------ #
    def _prefix_splits(self, word: str):
        """
        Yield ``(prefix, remainder)`` for every known prefix that ``word``
        starts with and that leaves at least ``_MIN_ROOT`` characters.

        Pairs come out in the order ``get_prefixes()`` lists the prefixes.
        """
        for prefix in self.affixes.get_prefixes():
            if not word.startswith(prefix):
                continue
            remainder = word[len(prefix):]
            if len(remainder) >= self._MIN_ROOT:
                yield prefix, remainder

    # ------------------------------------------------------------------ #
    # Pass 0 -- frozen-form guard                                        #
    # ------------------------------------------------------------------ #
    def _is_frozen(self, word: str) -> bool:
        """
        Tell whether ``word`` is a frozen/lexicalized form.

        That is the case when the word itself is a dictionary root and
        stripping some prefix leaves another dictionary root whose
        definition is identical to the word's own -- a sign of a
        duplicate/alternate entry rather than productive affixation.

        Example: 'pangalan' minus 'pang-' gives 'alan'; both are defined
        as "name", so 'pangalan' is frozen.
        """
        if not self.roots.is_root(word):
            return False
        return any(
            self.roots.is_root(rest) and self._is_redundant(word, rest)
            for _, rest in self._prefix_splits(word)
        )

    # ------------------------------------------------------------------ #
    # Pass 1 -- circumfix                                                #
    # ------------------------------------------------------------------ #
    def _try_circumfix(self, word: str) -> list | None:
        """
        Look for a circumfix wrapped around a dictionary root.

        Returns ``[prefix, core, suffix]`` for the first matching pair
        from ``get_circumfixes()``, or ``None`` when no pair fits.
        """
        for head, tail in self.affixes.get_circumfixes():
            # The two halves must leave something in between.
            if len(word) <= len(head) + len(tail):
                continue
            if not (word.startswith(head) and word.endswith(tail)):
                continue

            core = word[len(head):-len(tail)]
            if len(core) < self._MIN_ROOT:
                continue
            if not self.roots.is_root(core):
                continue
            if self._is_redundant(word, core):
                continue
            return [head, core, tail]
        return None

    # ------------------------------------------------------------------ #
    # Pass 2 -- prefix (recursive for stacked)                           #
    # ------------------------------------------------------------------ #
    def _try_prefix(self, word: str, depth: int = 0) -> list | None:
        """
        Strip one prefix and analyse what is left.

        For each matching prefix the remainder is first analysed further
        (another prefix, then an infix); only if neither works is the
        remainder accepted as a bare root, provided it is in the
        dictionary and not redundant with ``word``.

        ``depth`` counts the recursion level; beyond level 3 the search
        gives up.  Returns the list of morphemes or ``None``.
        """
        if depth > 3:
            return None

        for prefix, rest in self._prefix_splits(word):
            # Prefer a deeper analysis of the remainder over a bare root.
            deeper = self._try_prefix(rest, depth + 1) or self._try_infix(rest)
            if deeper:
                return [prefix] + deeper

            if self.roots.is_root(rest) and not self._is_redundant(word, rest):
                return [prefix, rest]
        return None

    # ------------------------------------------------------------------ #
    # Pass 3 -- infix                                                    #
    # ------------------------------------------------------------------ #
    def _try_infix(self, word: str) -> list | None:
        """
        Detect an infix placed directly after the first character.

        Words shorter than three characters or starting with a vowel are
        skipped.  Returns ``[infix, root]`` where ``root`` is the word
        with the infix removed, or ``None``.  Unlike the other passes,
        no redundancy check is made here.
        """
        if len(word) < 3 or word[0] in self.VOWELS:
            return None

        onset, tail = word[0], word[1:]
        for infix in self.affixes.get_infixes():
            if not tail.startswith(infix):
                continue
            root = onset + tail[len(infix):]
            if len(root) >= self._MIN_ROOT and self.roots.is_root(root):
                return [infix, root]
        return None

    # ------------------------------------------------------------------ #
    # Pass 4 -- suffix                                                   #
    # ------------------------------------------------------------------ #
    def _try_suffix(self, word: str) -> list | None:
        """
        Strip a suffix, letting the phonology helper propose root forms.

        Returns ``[root, surface_suffix]`` for the first acceptable
        candidate, where the surface suffix is whatever
        ``apply_suffix_phonology`` produces for that root, or ``None``.
        """
        for suffix in self.affixes.get_suffixes():
            for candidate in self.phonology.strip_suffix(word, suffix):
                if len(candidate) < self._MIN_ROOT:
                    continue
                if not self.roots.is_root(candidate):
                    continue
                if self._is_redundant(word, candidate):
                    continue
                surface = self.phonology.apply_suffix_phonology(
                    candidate, suffix
                )
                return [candidate, surface]
        return None

    # ------------------------------------------------------------------ #
    # Helpers                                                            #
    # ------------------------------------------------------------------ #
    def _is_redundant(self, word: str, root_candidate: str) -> bool:
        """
        Tell whether ``word`` and ``root_candidate`` both have dictionary
        entries whose ``'definition'`` values are equal once surrounding
        whitespace is stripped.  Returns ``False`` if either entry is
        missing.
        """
        whole = self.roots.get_root_info(word)
        stripped = self.roots.get_root_info(root_candidate)
        if not (whole and stripped):
            return False
        return whole['definition'].strip() == stripped['definition'].strip()