infosearch

04: 索引付け

ページランク

実装

import dgl
import networkx as nx
import numpy as np
import torch
from dgl import function as fn
from matplotlib import pyplot as plt


def build_simple_graph(show_plt: bool = False) -> dgl.DGLGraph:
    g = dgl.DGLGraph()
    g.add_nodes(3)
    src = np.array([0, 0, 1, 2])
    dst = np.array([1, 2, 2, 0])
    g.add_edges(src, dst)
    print("node:", g.number_of_nodes())
    print("edge:", g.number_of_edges())
    nx.draw(g.to_networkx(), node_size=3)
    if show_plt:
        plt.show()
    return g


def pagerank_builtin(g: dgl.DGLGraph, damp: float = 0.85) -> None:
    g.ndata['pv'] /= g.ndata['deg']
    g.update_all(message_func=fn.copy_src(src='pv', out='m'),
                 reduce_func=fn.sum(msg='m', out='m_sum'))
    N = g.number_of_nodes()
    g.ndata['pv'] = (1 - damp) / N + damp * g.ndata['m_sum']


def main() -> None:
    F = build_simple_graph()
    node_cnt = F.number_of_nodes()
    F.ndata['pv'] = torch.ones(node_cnt) / node_cnt
    F.ndata['deg'] = F.out_degrees(F.nodes()).float()

    print("init vals:", F.ndata['pv'])
    for _ in range(3):
        pagerank_builtin(F)
        print('step %d:' % _, F.ndata['pv'])


if __name__ == '__main__':
    main()
$ python pagerank.py
Using backend: pytorch
node: 3
edge: 4
init vals: tensor([0.3333, 0.3333, 0.3333]) # 初期値1/3
step 0: tensor([0.3333, 0.1917, 0.4750])
step 1: tensor([0.4538, 0.1917, 0.3546])
step 2: tensor([0.3514, 0.2428, 0.4058])

索引

索引の転置

フィールドと索引付け

N-gram

text="おはよう"
p text.chars.each_cons(2).map(&:join) #=>["おは", "はよ", "よう"]

部分文字列検索

text="さくらさくらいま"
(0..._=text.size).map{text[_1,_]}.sort
#=> ["いま", "くらいま", "くらさくらいま", "さくらいま",
#    "さくらさくらいま", "ま", "らいま", "らさくらいま"]

索引の圧縮

なんで圧縮?

基本的な考え方

圧縮手法

cal_d_gap=->inv{inv.each_cons(2).map{_1.inject(:-).abs}}
p cal_d_gap[[1, 5, 9, 18, 23, 24, 30, 44, 45, 48]]
#=> [4, 4, 9, 5, 1, 6, 14, 1, 3]

スキップ操作

# sorted
inv_lst = [1, 5, 9, 18, 23, 24, 30, 44, 45, 48]
mk_v_dgaps =->inv{
              inv.each_cons(2)
              .map{_1.inject(:-).abs}}
mk_p_skips =->inv,n{
              [*inv.each_with_index]
              .each_slice(n)
              .map(&:first)[1..]}
search_d  =->d,gaps,skips{
             _m=skips[0][1]
             _d,_p=skips.filter{|a,_|a<=d}[-1]
             (_p..._e=_p+_m).map{|i|
               _d==d ? (return i;break) :
               i==_e ? (return -1;break): (_d+=gaps[i])}}
search_d[45,
         mk_v_dgaps[inv_lst],
         mk_p_skips[inv_lst,3]]
#=> 8

索引のマージ

索引付けの分散化

MapReduce