import struct import types import cStringIO import string import os import tempfile import math import bisect import irlibdts from ipage import * #-- Version 1.1 #-- Nathan Denny, November 11, 2000 #-- Added two new methods: #-- IRDomain.termFrequency(self,docID,term) #-- IRDomain.inverseDocumentFrequency(self,term) class IRDomain: INVERTED_INDEX_PAGE_TYPE=0 DOCUMENT_PAGE_TYPE=1 ATTRIBUTE_PAGE_TYPE=2 def __init__(self,sourceFileName): self.sourceFileName=sourceFileName if os.path.exists(self.sourceFileName): self.sourceFile=open(self.sourceFileName,'rb') else: self.sourceFile=None self.pageTypes={} self.pageTypes[IRDomain.INVERTED_INDEX_PAGE_TYPE]=InvertedIndexPage self.pageTypes[IRDomain.DOCUMENT_PAGE_TYPE]=DocumentIndexPage self.pageTypes[IRDomain.ATTRIBUTE_PAGE_TYPE]=AttributeIndexPage self.indeces=Index(self.pageTypes,self.sourceFile) self.modified=0 def __del__(self): if self.modified: self.commit() def addDocuments(self,documentList): self.modified=1 I=InvertedIndexPage() D=DocumentIndexPage() A=AttributeIndexPage() docIDs=self.indeces[IRDomain.DOCUMENT_PAGE_TYPE].keys() if len(docIDs): nextDocID=max(docIDs)+1 else: nextDocID=0 for document in documentList: for term in document.terms.keys(): I.index(term,nextDocID,document.terms[term]) D[nextDocID]=document.name for attribute in document.attributes.keys(): A.setAttribute(nextDocID,attribute,document.attributes[attribute]) nextDocID=nextDocID+1 self.indeces.addIndex(IRDomain.INVERTED_INDEX_PAGE_TYPE,I) self.indeces.addIndex(IRDomain.DOCUMENT_PAGE_TYPE,D) self.indeces.addIndex(IRDomain.ATTRIBUTE_PAGE_TYPE,A) def commit(self): #-- Open a temporary file. tempFileName=tempfile.mktemp() print "Committing to %s"%tempFileName f=open(tempFileName,'wb') f.write('%s'%self.indeces) f.close() #-- Now delete the original file. if self.sourceFile: self.sourceFile.close() if os.path.exists(self.sourceFileName): os.remove(self.sourceFileName) os.rename(tempFileName,self.sourceFileName) #-- Reload the index set. self.sourceFile=open(self.sourceFileName,'rb') self.indeces=Index(self.pageTypes,self.sourceFile) #-- Unset the modified bit. self.modified=0 def getAttribute(self,docID,attribute): for indexPage in self.indeces[IRDomain.ATTRIBUTE_PAGE_TYPE].indexPages: key=indexPage.genKey(docID,attribute) if indexPage.has_key(key): return indexPage[key] return None def getName(self,docID): return self.indeces[IRDomain.DOCUMENT_PAGE_TYPE][docID] def getNames(self,docIDList): docNames=[] for docID in docIDList: docNames.append(self.indeces[IRDomain.DOCUMENT_PAGE_TYPE][docID]) return docNames def attributeQuery(self,query): resultSet=[] docIDs=self.indeces[IRDomain.DOCUMENT_PAGE_TYPE].keys() for docID in docIDs: #-- For each attribute in the query, retrieve the corresponding #-- attribute of the current document and see if it satisfies #-- the expression. If it doesn't, break out of the loop and #-- move on to the next document. satisfied=1 for queryTerm in query.keys(): value=self.getAttribute(docID,queryTerm) if type(query[queryTerm])==types.ListType: if value not in query[queryTerm]: satisfied=0 break elif type(query[queryTerm])==types.TupleType: min,max=query[queryTerm] if valuemax: satisfied=0 break else: if value!=query[queryTerm]: satisfied=0 break if satisfied: resultSet.append(docID) return resultSet def filterResultSet(self,resultSet,allowableList): filtered=[] allowableList.sort() for result in resultSet: score,docID=result i=bisect.bisect(allowableList,docID) j=i-1 if j>=0: if allowableList[j]==docID: filtered.append(result) return filtered def booleanQuery(self,terms,resultSize=100,attributeFilter=None): uTerms=[] for term in terms: uTerms.append(string.upper(string.strip(term))) docIDs={} for term in uTerms: results=self.indeces[IRDomain.INVERTED_INDEX_PAGE_TYPE][term] if results is not None: if type(results)!=types.ListType: results=[results] for result in results: docID,count=result if docIDs.has_key(docID): docIDs[docID]=docIDs[docID]+1 else: docIDs[docID]=1 scoreList=[] for docID in docIDs.keys(): scoreList.append((docIDs[docID],docID)) #-- Filter the result set if a filter is available. if attributeFilter is not None: filterSet=self.attributeQuery(attributeFilter) scoreList=self.filterResultSet(scoreList,filterSet) scoreList.sort() scoreList.reverse() return scoreList[:resultSize] def statisticalQuery(self,terms,resultSize=100,attributeFilter=None): #-- From a user's perspective, terms are case insensitive. #-- The terms are stored UPPERCASE in the index, so convert #-- all the terms to uppercase for comparison. uTerms=[] for term in terms: uTerms.append(string.upper(string.strip(term))) #-- Get the number of documents in the set. This will be used #-- rather repeatedly. docs=len(self.indeces[IRDomain.DOCUMENT_PAGE_TYPE].keys()) #-- Now, retrieve and score the results using the TF IDF method. docIDs={} for term in uTerms: results=self.indeces[IRDomain.INVERTED_INDEX_PAGE_TYPE][term] if results is not None: if type(results)!=types.ListType: results=[results] IDF=(len(results)*1.0)/docs for result in results: docID,TF=result termScore=(math.log(TF)+1)*(-math.log(IDF)) if docIDs.has_key(docID): docIDs[docID]=docIDs[docID]+termScore else: docIDs[docID]=termScore #-- Create a result set list. scoreList=[] for docID in docIDs.keys(): scoreList.append((docIDs[docID],docID)) #-- Filter the result set if a filter is available. if attributeFilter is not None: filterSet=self.attributeQuery(attributeFilter) scoreList=self.filterResultSet(scoreList,filterSet) #-- Put the highest scoring items at the beginning of the list. scoreList.sort() scoreList.reverse() return scoreList[:resultSize] #-- Returns the number of times the term is used in the given document. def termFrequency(self,thisDocID,term): Uterm=string.upper(string.strip(term)) results=self.indeces[IRDomain.INVERTED_INDEX_PAGE_TYPE][Uterm] if results is not None: if type(results)!=types.ListType: results=[results] IDF=(len(results)*1.0)/docs for result in results: docID,TF=result if docID==thisDocID: return TF return 0 #-- Returns the proportion of documents that contain at least one instance of #-- the given term. def inverseDocumentFrequency(self,term): docs=len(self.indeces[IRDomain.DOCUMENT_PAGE_TYPE].keys()) Uterm=string.upper(string.strip(term)) results=self.indeces[IRDomain.INVERTED_INDEX_PAGE_TYPE][Uterm] if results is not None: if type(results)!=types.ListType: results=[results] return (len(results)*1.0)/docs else: return 0.0 class AttributeIndexPage(IndexedPage): KEY_PREFIX_STRUCTURE="!I" KEY_PREFIX_STRUCTURE_length=struct.calcsize(KEY_PREFIX_STRUCTURE) def __init__(self,**kwargs): IndexedPage.__init__(self,kwargs) self.pageType=2 def genKey(self,docID,attribute): return '%s%s'%(struct.pack(AttributeIndexPage.KEY_PREFIX_STRUCTURE,docID),attribute) def splitKey(self,key): serializedDocID=key[:AttributeIndexPage.KEY_PREFIX_STRUCTURE_length] docID=struct.unpack(AttributeIndexPage.KEY_PREFIX_STRUCTURE,serializedDocID) attribute=key[AttributeIndexPage.KEY_PREFIX_STRUCTURE_length:] return (docID,attribute) def getAttributes(self,docID): attributeSet={} serializedKeyPrefix=struct.pack(AttributeIndexPage.KEY_PREFIX_STRUCTURE,docID) keys=self.keys() for key in keys: if key[:AttributeIndexPage.KEY_PREFIX_STRUCTURE_length]==serializedKeyPrefix: attributeSet[key[AttributeIndexPage.KEY_PREFIX_STRUCTURE_length:]]=self.__getitem__(key) return attributeSet def setAttributes(self,docID,attributeSet): for attribute in attributeSet.keys(): self.setAttribute(docID,attribute,attributeSet[attribute]) def getAttribute(self,docID,attribute): key=self.genKey(docID,attribute) if self.has_key(key): return self.__getitem__(key) else: return None def setAttribute(self,docID,attribute,value): key=self.genKey(docID,attribute) self.__setitem__(key,value) class InvertedIndexPage(IndexedPage): ENTRY_STRUCTURE="!IH" ENTRY_STRUCTURE_length=struct.calcsize(ENTRY_STRUCTURE) def __init__(self,**kwargs): IndexedPage.__init__(self,kwargs) self.pageType=0 #-- method aliases. self.terms=self.keys def index(self,term,doc_id,count): if self.storage=="memory": if self.cache.has_key(term): self.cache[term].append((doc_id,count)) else: self.cache[term]=[(doc_id,count)] else: raise Exception("Cannot modify on-disk index.") def serializeEntry(self,entryList): serializedEntries=[] for entry in entryList: doc_id,count=entry serializedEntries.append(struct.pack(InvertedIndexPage.ENTRY_STRUCTURE,doc_id,count)) return treeReduce(lambda x,y:x+y,serializedEntries) def unserializeEntry(self,entryData): entryList=[] i=0 while i=minAbsoluteCount and relativeContainment<=maxRelativeContainment: I[term]=hits #-- Merge the document pages. print "Merging document indeces" D=DocumentIndexPage() for docID in docIDs: D[docID]=IRindex.indeces[IRDomain.DOCUMENT_PAGE_TYPE][docID] #-- Merge the attribute pages. print "Merging attribute indeces" A=AttributeIndexPage() attributeKeys=IRindex.indeces[IRDomain.ATTRIBUTE_PAGE_TYPE].keys() for attributeKey in attributeKeys: A[attributeKey]=IRindex.indeces[IRDomain.ATTRIBUTE_PAGE_TYPE][attributeKey] #-- Create a new index! print "Building new index" IRindex2=IRDomain(optimizedFileName) IRindex2.indeces.addIndex(IRDomain.INVERTED_INDEX_PAGE_TYPE, I) IRindex2.indeces.addIndex(IRDomain.DOCUMENT_PAGE_TYPE, D) IRindex2.indeces.addIndex(IRDomain.ATTRIBUTE_PAGE_TYPE, A) IRindex2.commit()