infosearch

08: 検索モデル その2-確率モデル-

確率を利用したランキング

テキスト分類

ナイーブベイズモデル

利点

欠点

ベイズ分類

ネガポジ分類

NLTK

import random
from typing import Any, Dict, List, Literal, Tuple, Union

import nltk

# Look up the movie_reviews corpus in the local NLTK data; download it only
# when the lookup fails with LookupError.
try:
    nltk.data.find('corpora/movie_reviews')
except LookupError:
    nltk.download('movie_reviews')

from nltk.corpus import movie_reviews as mr

# Frequency distribution over every lower-cased token of the corpus.
MR_ALL_WORDS = nltk.FreqDist(w.lower()
                             for w in mr.words())
# Feature vocabulary: the 2000 most frequent words.
# FreqDist is a Counter subclass, so keys() follows insertion order rather
# than frequency; most_common() is what actually ranks words by count.
WORD_FEATURES = [word for word, _count in MR_ALL_WORDS.most_common(2000)]


def doc_features(doc) -> Dict[str, bool]:
    """Feature extractor.

    Returns a dict with one ``'contains(<word>)'`` key per entry of
    WORD_FEATURES; the value is True when that word occurs in ``doc``.
    """
    # Build the set once so each membership test below is a hash lookup.
    present = set(doc)
    return {'contains(%s)' % word: word in present
            for word in WORD_FEATURES}


# Return type of cal_accuracy: (accuracy value, trained classifier).
AccResults = Tuple[Union[Any, float, Literal[0]],
                   nltk.NaiveBayesClassifier]


def cal_accuracy(docs: List[Tuple[list, str]],
                 test_size: int = 100) -> AccResults:
    """Train a Naive Bayes classifier and compute its accuracy.

    The first ``test_size`` documents are held out as the test set and the
    remaining documents are used for training.

    Args:
        docs: (token list, category label) pairs.
        test_size: number of leading documents held out for evaluation.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        A tuple of (accuracy on the held-out documents, trained classifier).

    Raises:
        ValueError: if ``test_size`` is negative.
    """
    if test_size < 0:
        # A negative value would slice from the end and silently change
        # which documents are held out.
        raise ValueError("test_size must be non-negative")
    feat_sets = [(doc_features(tokens), label) for tokens, label in docs]
    test_set, train_set = feat_sets[:test_size], feat_sets[test_size:]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return nltk.classify.accuracy(classifier, test_set), classifier


def main() -> None:
    """Train on shuffled movie reviews and print the evaluation results.

    Collects (token list, category) pairs for every file of every category,
    shuffles them, then prints the accuracy and the 5 most informative
    features of the trained classifier.
    """
    labelled_docs = []
    for category in mr.categories():
        for file_id in mr.fileids(category):
            tokens = list(mr.words(file_id))
            labelled_docs.append((tokens, category))
    random.shuffle(labelled_docs)
    accuracy, classifier = cal_accuracy(labelled_docs)
    print("Accuracy:", accuracy)
    classifier.show_most_informative_features(5)


# Run the experiment only when this module is executed as the main program.
if __name__ == '__main__':
    main()

ベイズ分類

P(D|R)の推定

二項独立モデル

BM25

bm25

言語モデル

Web 検索と機械学習による検索モデル