Package translate :: Package search :: Package indexing :: Module XapianIndexer
[hide private]
[frames | no frames]

Source Code for Module translate.search.indexing.XapianIndexer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """ 
 24  interface to the xapian indexing engine for the translate toolkit 
 25   
 26  Xapian v1.0 or higher is supported. 
 27   
 28  If you are interested in writing an interface for Xapian 0.x, then 
 29  you should checkout the following:: 
 30      svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/ 
 31  It is not completely working, but it should give you a good start. 
 32  """ 
 33   
 34  __revision__ = "$Id: XapianIndexer.py 10834 2009-04-09 14:40:06Z alaaosh $" 
 35   
 36  # xapian module hangs apache under mod_python 
 37  # detect if running under apache and fail immediatly 
 38  import sys 
 39  if 'apache' in sys.modules or '_apache' in sys.modules: 
 40      raise ImportError("Running under mod_python, can't load xapian") 
 41   
 42  import CommonIndexer 
 43  import xapian 
 44  import os 
 45  import re 
 46   
 47   
def is_available():
    """check if the loaded xapian module is a supported version

    @return: True if the major version reported by xapian is greater
        than zero (i.e. xapian v1.0 or higher)
    @rtype: bool
    """
    major_version = xapian.major_version()
    return major_version > 0
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# A maximum of around 240 is mentioned there - we need less than that anyway.
_MAX_TERM_LENGTH = 128
class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    The database handle is stored in C{self.database}. It is replaced by a
    writable or a read-only handle on demand (see L{_prepare_database}),
    because of a xapian specific issue regarding open locks for writable
    databases.
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer
            (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary;
            default: True
        @type create_allowed: bool
        """
        # call the __init__ function of our parent
        # ("self.location" is read below, so the parent has to provide it)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, err_msg))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            parent_path = os.path.dirname(self.location)
            try:
                # create the parent directory if it does not exist
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs reports failures as OSError - catching only
                # IOError would never produce the message below
                raise OSError("Indexer: failed to create the parent "
                        "directory (%s) of the indexing database: %s"
                        % (parent_path, err_msg))
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        "database (%s): %s" % (self.location, err_msg))

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        Afterwards the database handle is dropped and reopened as read-only.

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if isinstance(self.database, xapian.WritableDatabase):
            self.database.flush()
        # free the database to remove locks - this is a xapian-specific issue
        self.database = None
        # reopen it as read-only
        self._prepare_database()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer
            of this database (self.analyzer) is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if analyzer is None:
            analyzer = self.analyzer
        # the partial flag is checked before the exact comparison
        if analyzer & self.ANALYZER_PARTIAL > 0:
            return qp.parse_query(text, xapian.QueryParser.FLAG_PARTIAL)
        if analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a term without parsing
            return xapian.Query(text)
        # everything else (not partial and not exact)
        return qp.parse_query(text, 0)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer
            of this database (self.analyzer) is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        # the upper-cased field name is used as the term prefix
        prefix = field.upper()
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (prefix, value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, prefix)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored directly, so their length
            # has to be limited here
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        The upper-cased field name is used as the prefix of the term.

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, field.upper())
        else:
            document.add_term(_truncate_term_length("%s%s" %
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._prepare_database(writable=True)
        self.database.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database modifications.
        This avoids intermediate flushing and therefore increases performance.
        """
        self._prepare_database(writable=True)
        self.database.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.cancel_transaction()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.commit_transaction()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.database)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if xapian
            reported that the document was not found
        @rtype: bool
        """
        # open the database for writing
        self._prepare_database(writable=True)
        try:
            self.database.delete_document(docid)
        except xapian.DocNotFoundError:
            return False
        return True

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        try:
            string_types = basestring
        except NameError:
            # there is no 'basestring' in Python 3
            string_types = str
        result = []
        # a single field name is treated like a list with one item
        if isinstance(fieldnames, string_types):
            fieldnames = [fieldnames]
        # '_extract_fieldvalues' appends one dict per match to 'result'
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _prepare_database(self, writable=False):
        """reopen the database as read-only or as writable if necessary

        this fixes a xapian specific issue regarding open locks for
        writable databases

        NOTE(review): a read-only request only reopens the database if the
        current handle is not a xapian.Database instance. If
        xapian.WritableDatabase is a subclass of xapian.Database, then a
        writable handle is kept as it is - confirm that this is intended.

        @param writable: True for opening a writable database
        @type writable: bool
        """
        if writable:
            if not isinstance(self.database, xapian.WritableDatabase):
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
        elif not isinstance(self.database, xapian.Database):
            self.database = xapian.Database(self.location)
374 375
class XapianEnquire(CommonIndexer.CommonEnquire):
    """wrapper around a xapian enquire object for retrieving sets of matches
    """

    def get_matches(self, start, number):
        """fetch a range of the matches of the query behind self.enquire

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        mset = self.enquire.get_mset(start, number)
        # turn every match into a plain dictionary
        found = [{"rank": item[xapian.MSET_RANK],
                  "docid": item[xapian.MSET_DID],
                  "percent": item[xapian.MSET_PERCENT],
                  "document": item[xapian.MSET_DOCUMENT]}
                 for item in mset]
        returned_count = mset.size()
        estimated_count = mset.get_matches_estimated()
        return (returned_count, estimated_count, found)
402 403
def _truncate_term_length(term, taken=0):
    """shorten a term string that exceeds the allowed length of xapian terms

    The allowed length is _MAX_TERM_LENGTH reduced by "taken". A term that
    fits is returned unchanged. A longer term is cut down to one character
    less than the allowed length.

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string
    @rtype: str
    """
    allowed = _MAX_TERM_LENGTH - taken
    if len(term) <= allowed:
        return term
    return term[:allowed - 1]
421
def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    A term belongs to a field if it starts with the upper-cased field name
    followed by a character that is not an uppercase ASCII letter; the
    prefix is removed from the value. The field name None selects the terms
    that do not start with an uppercase ASCII letter (unchanged).

    @param match: a single match object; its "document" item must offer a
        "termlist()" method that yields objects with a "term" attribute
    @type match: dict
    @param result_and_fieldnames: a tuple of (result, fieldnames):
         - result: the resulting dict (field name -> list of values) will
           be appended to this list
         - fieldnames: the names of the fields to be added to the dict
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    result, fieldnames = result_and_fieldnames
    # prepare the pattern and the prefix length of every field only once
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # plain terms: nothing is removed from the value
            matchers.append((fname, re.compile("[^A-Z]"), 0))
        else:
            prefix = str(fname).upper()
            # the field name is escaped, since it is not a regular expression
            pattern = re.compile(re.escape(prefix) + "[^A-Z]")
            matchers.append((fname, pattern, len(prefix)))
    # fill the dict
    item_fields = {}
    for term in match["document"].termlist():
        text = term.term
        for fname, pattern, prefix_length in matchers:
            if pattern.match(text):
                # we found a matching field/term
                item_fields.setdefault(fname, []).append(text[prefix_length:])
    result.append(item_fields)
451