Source code for zope.index.text.cosineindex

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure.
"""
import math

from zope.index.text.baseindex import BaseIndex
from zope.index.text.baseindex import inverse_doc_frequency



[docs]
class CosineIndex(BaseIndex):
    """
    Full text index with relevance ranking, using a cosine measure.
    """

    def __init__(self, lexicon, family=None):
        BaseIndex.__init__(self, lexicon, family=family)

        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(d, t)/W(d)

        # ._docweight for cosine is
        # docid -> W(docid)

    # Most of the computation for computing a relevance score for the
    # document occurs in the _search_wids() method.  The code currently
    # implements the cosine similarity function described in Managing
    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
    # precomputes some values that are independent of the particular
    # query.

    # The equation is
    #
    #                     sum(for t in I(d,q): w(d,t) * w(q,t))
    #     cosine(d, q) =  -------------------------------------
    #                                  W(d) * W(q)
    #
    # where
    #    I(d, q) = the intersection of the terms in d and q.
    #
    #    w(d, t) = 1 + log f(d, t)
    #        computed by doc_term_weight(); for a given word t,
    #        self._wordinfo[t] is a map from d to w(d, t).
    #
    #    w(q, t) = log(1 + N/f(t))
    #        computed by inverse_doc_frequency()
    #
    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
    #        computed by _get_frequencies(), and remembered in
    #        self._docweight[d]
    #
    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
    #        computed by self.query_weight()

    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(len(self._docweight))
        L = []
        DictType = type({})
        for wid in wids:
            assert wid in self._wordinfo  # caller responsible for OOV
            d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
            # print "idf = %.3f" % idf
            if isinstance(d2w, DictType):
                d2w = self.family.IF.Bucket(d2w)
            L.append((d2w, idf))
        return L

    def query_weight(self, terms):
        wids = []
        for term in terms:
            wids += self._lexicon.termToWordIds(term)
        N = float(len(self._docweight))
        sum = 0.0
        for wid in self._remove_oov_wids(wids):
            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
            sum += wt ** 2.0
        return math.sqrt(sum)

    def _get_frequencies(self, wids):
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        Wsquares = 0.0
        for wid, count in d.items():
            w = doc_term_weight(count)
            Wsquares += w * w
            d[wid] = w
        W = math.sqrt(Wsquares)
        # print "W = %.3f" % W
        for wid, weight in d.items():
            # print i, ":", "%.3f" % weight,
            d[wid] = weight / W
            # print "->", d[wid]
        return d, W



def doc_term_weight(count):
    """Return the doc-term weight for a term that appears count times."""
    # implements w(d, t) = 1 + log f(d, t)
    return 1.0 + math.log(count)