Source code for mecab_text_cleaner._main

from __future__ import annotations

import re
import warnings
from logging import getLogger
from typing import Callable, Literal, NamedTuple

import fugashi

LOG = getLogger(__name__)


class UnidicFeatures17(NamedTuple):
    """https://clrd.ninjal.ac.jp/unidic/faq.html
    https://clrd.ninjal.ac.jp/unidic/UNIDIC_manual.pdf"""

    # ChaSen品詞体系
    pos1: str
    """品詞大分類(ChaSen品詞体系)"""
    pos2: str
    """品詞中分類(ChaSen品詞体系)"""
    pos3: str
    """品詞小分類(ChaSen品詞体系)"""
    pos4: str
    """品詞細分類(ChaSen品詞体系)"""

    # 語彙素
    lForm: str
    """語彙素読み

    語彙素見出し(カタカナ表記)"""
    lemma: str
    """語彙素(+語彙素細分類)

    語彙素見出し(漢字仮名混じり表記)"""
    goshu: str
    """語種

    語種の名称"""

    # 語形
    cType: str
    """活用型

    活用の種類(型)"""
    cForm: str
    """活用形

    活用の形"""
    iType: str
    """語頭変化型

    語頭音変化の種類(型)"""
    iForm: str
    """語頭変化形

    語頭音変化の形"""
    fType: str
    """語末変化型

    語末音変化の種類(型)"""
    fForm: str
    """語末変化形

    語末音変化の形"""

    # 書字形
    orth: str
    """書字形出現形

    書字形基本形が活用変化を受けたもの"""
    orthBase: str
    """書字形基本形

    書字形見出し"""

    # 発音形
    pron: str
    """発音形出現形

    発音形基本形が活用変化を受けたもの"""
    pronBase: str
    """発音形基本形

    発音形見出し"""


class UnidicFeatures26(UnidicFeatures17):
    kana: str
    """仮名形出現形

    書字形基本形をカタカナ表記にしたもの"""
    kanaBase: str
    """仮名形基本形

    書字形出現形をカタカナ表記にしたもの"""
    form: str
    """語形出現形

    語形が活用変化を受けたもの"""
    formBase: str
    """語形基本形

    語形見出し"""
    iConType: str
    """語頭変化結合型

    後続要素の語頭変化形への制約の種類(型)"""
    fConType: str
    """語末変化結合型"""
    aType: str
    """アクセント型

    アクセント核の位置"""
    aConType: str
    """アクセント結合型

    前接(後続)要素との結合時のアクセント変化の種類(型)"""
    aModType: str
    """アクセント修飾型

    活用によるアクセント変化の種類(型)"""


class UnidicFeatures29(UnidicFeatures26):
    type: str
    """品詞"""
    lid: str
    """語彙表ID"""
    lemma_id: str
    """語彙素ID"""


def _get_tagger() -> fugashi.Tagger:
    try:
        return fugashi.Tagger()
    except BaseException as e:
        LOG.exception(e)

        import importlib.util

        if importlib.util.find_spec("unidic"):
            LOG.info("unidic is installed, trying to download unidic")
            import unidic.download

            unidic.download.download_version()
            return fugashi.Tagger()
        raise RuntimeError("fugashi.Tagger() failed") from e


[docs]def to_reading( text: str, reading_type: Literal["orth", "pron", "kana"] = "pron", add_atype: bool = True, add_blank_between_words: bool = True, when_unknown: Literal["passthrough", "*", "unidecode"] | Callable[[str], str] = "passthrough", tagger: fugashi.Tagger = _get_tagger(), ) -> str: """Convert text to reading. Note that MeCab interprets spaces as word boundaries, and will be removed. Lines (\\n only) are restored later. Parameters ---------- text : str The text to convert. reading_type : Literal[&quot;orth&quot;, &quot;pron&quot;, &quot;kana&quot;], optional Reading type, by default "pron" "pron" is the pronunciation (発音形), "orth" is the orthography (書字形), "kana" is the kana(仮名) form of orthography add_atype : bool, optional Whether to consider aType (アクセント型) and add "]" to the reading, by default True add_blank_between_words : bool, optional Whether to add a blank between words, by default True when_unknown : Literal[&quot;passthrough&quot;, , optional What to do when the reading is unknown ("補助記号" and "一般"), by default "passthrough" "passthrough" will pass the original text, "*" will pass "*", "unidecode" will use unidecode, and a callable will be called with the original text tagger : fugashi.Tagger, optional The tagger to use, by default fugashi.Tagger() Returns ------- str The reading Raises ------ ImportError When when_unknown="unidecode" and unidecode is not installed Examples -------- >>> from mecab_text_cleaner import to_reading >>> to_reading(" 空、雲。\\n雨!(") 'ソ]ラ、 ク]モ。\\nア]メ!(' """ if when_unknown == "unidecode": # check unidecode first import unidecode res = "" for line in text.splitlines(): for word in tagger(line): LOG.debug(f"word={word}, feature={word.feature}") reading = getattr(word.feature, reading_type) if reading in ("", "*", None): # unknown reading if not (word.feature.pos1 == "補助記号" and word.feature.pos2 == "一般"): # known symbol if add_blank_between_words: res = res[:-1] res += word.surface # unknown symbol elif when_unknown == "passthrough": res += word.surface elif when_unknown == "*": res += "*" elif when_unknown == "unidecode": res += unidecode.unidecode(word.surface) elif callable(when_unknown): res += when_unknown(word.surface) else: raise ValueError( f"when_unknown={when_unknown} is not supported" ) # pragma: no cover # add blank between words if add_blank_between_words: res += " " continue # known reading if ( add_atype and word.feature.aType is not None and word.feature.aType != "*" ): # aType is number try: aTypes = [int(aType) for aType in word.feature.aType.split(",")] except ValueError as e: warnings.warn( f"aType={word.feature.aType} is not a number, ignoring", RuntimeWarning, source=e, ) # pragma: no cover else: if len(aTypes) > 1: warnings.warn( f"aType={word.feature.aType} has multiple values, " "using the first one. " "This is expected to happen.", RuntimeWarning, ) aType = aTypes[0] if aType == 0: reading += "=" elif aType <= len(reading): reading = reading[:aType] + "]" + reading[aType:] else: warnings.warn( f"aType={aType} is too large for reading={reading} " f"of len={len(reading)}, ignoring", RuntimeWarning, ) # pragma: no cover else: # no aType pass res += reading # add blank between words if add_blank_between_words: res += " " # remove last blank if add_blank_between_words: res = res[:-1] # add newline res += "\n" # remove last newline return res[:-1]
[docs]def to_ascii_clean( text: str, reading_type: Literal["orth", "pron", "kana"] = "pron", add_atype: bool = True, add_blank_between_words: bool = True, tagger: fugashi.Tagger = _get_tagger(), remove_multiple_spaces: bool = True, ) -> str: """Convert text to reading, then to ascii. Parameters ---------- text : str The text to convert. reading_type : Literal[&quot;orth&quot;, &quot;pron&quot;, &quot;kana&quot;], optional Reading type, by default "pron" "pron" is the pronunciation (発音形), "orth" is the orthography (書字形), "kana" is the kana(仮名) form of orthography add_atype : bool, optional Whether to consider aType (アクセント型) and add "]" to the reading, by default True add_blank_between_words : bool, optional Whether to add a blank between words, by default True when_unknown : Literal[&quot;passthrough&quot;, , optional What to do when the reading is unknown ("補助記号" and "一般"), by default "passthrough" "passthrough" will pass the original text, "*" will pass "*", "unidecode" will use unidecode, and a callable will be called with the original text tagger : fugashi.Tagger, optional The tagger to use, by default fugashi.Tagger() remove_multiple_spaces : bool, optional Whether to remove multiple spaces created by unidecode, by default True Returns ------- str The ascii-cleaned text Raises ------ ImportError When unidecode is not installed Examples -------- >>> from mecab_text_cleaner import to_reading >>> to_reading(" 空、雲。\\n雨!(") 'so]ra, ku]mo. \\na]me!(' """ import unidecode text = unidecode.unidecode( to_reading( text, reading_type=reading_type, add_atype=add_atype, add_blank_between_words=add_blank_between_words, when_unknown="passthrough", tagger=tagger, ) ) if remove_multiple_spaces: text = re.sub(r" +", " ", text) return text