Package translate :: Package search :: Package indexing :: Module PyLuceneIndexer
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.indexing.PyLuceneIndexer

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2008 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22   
 23  """ 
 24  interface for the PyLucene (v2.x) indexing engine 
 25   
 26  take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface 
 27  """ 
 28   
 29  __revision__ = "$Id: PyLuceneIndexer.py 11946 2009-07-19 23:59:10Z alaaosh $" 
 30   
 31  import CommonIndexer 
 32  # TODO: replace this dependency on the jToolkit 
 33  #import jToolkit.glock 
 34  import tempfile 
 35  import re 
 36  import os 
 37  import time 
 38   
 39  # try to import the PyLucene package (with the two possible names) 
 40  # remember the type of the detected package (compiled with jcc (>=v2.3) or 
 41  # with gcj (<=v2.2) 
 42  try: 
 43      import PyLucene 
 44      _COMPILER = 'gcj' 
 45  except ImportError: 
 46      # if this fails, then there is no pylucene installed 
 47      import lucene 
 48      PyLucene = lucene 
 49      PyLucene.initVM(PyLucene.CLASSPATH) 
 50      _COMPILER = 'jcc' 
 51   
 52   
 53  UNNAMED_FIELD_NAME = "FieldWithoutAName" 
 54  MAX_FIELD_SIZE = 1048576 
 55   
 56   
def is_available():
    """check whether the PyLucene (v2.x) indexing engine is usable

    @return: True if a PyLucene v2.x installation was detected, else False
    @rtype: bool
    """
    return _get_pylucene_version() == 2
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """Create, manage and query a PyLucene indexing database."""

    # the class of compiled queries produced by this backend
    QUERY_TYPE = PyLucene.Query
    # sub-directory name used for the lucene index files
    INDEX_DIRECTORY_NAME = "lucene"
67 - def __init__(self, basedir, analyzer=None, create_allowed=True):
68 """initialize or open an indexing database 69 70 Any derived class must override __init__. 71 72 @raise ValueError: the given location exists, but the database type 73 is incompatible (e.g. created by a different indexing engine) 74 @raise OSError: the database failed to initialize 75 76 @param basedir: the parent directory of the database 77 @type basedir: str 78 @param analyzer: bitwise combination of possible analyzer flags 79 to be used as the default analyzer for this database. Leave it empty 80 to use the system default analyzer (self.ANALYZER_DEFAULT). 81 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ... 82 @type analyzer: int 83 @param create_allowed: create the database, if necessary; default: True 84 @type create_allowed: bool 85 """ 86 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer, 87 create_allowed=create_allowed) 88 self.pyl_analyzer = PyLucene.StandardAnalyzer() 89 self.writer = None 90 self.reader = None 91 self.index_version = None 92 try: 93 # try to open an existing database 94 tempreader = PyLucene.IndexReader.open(self.location) 95 tempreader.close() 96 except PyLucene.JavaError, err_msg: 97 # Write an error out, in case this is a real problem instead of an absence of an index 98 # TODO: turn the following two lines into debug output 99 #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str() 100 #DEBUG_FOO("could not open index, so going to create: " + errorstr) 101 # Create the index, so we can open cached readers on it 102 if not create_allowed: 103 raise OSError("Indexer: skipping database creation") 104 try: 105 # create the parent directory if it does not exist 106 parent_path = os.path.dirname(self.location) 107 if not os.path.isdir(parent_path): 108 # recursively create all directories up to parent_path 109 os.makedirs(parent_path) 110 except IOError, err_msg: 111 raise OSError("Indexer: failed to create the parent " \ 112 + "directory (%s) of the indexing database: %s" \ 113 % (parent_path, 
err_msg)) 114 try: 115 tempwriter = PyLucene.IndexWriter(self.location, 116 self.pyl_analyzer, True) 117 tempwriter.close() 118 except PyLucene.JavaError, err_msg: 119 raise OSError("Indexer: failed to open or create a Lucene" \ 120 + " database (%s): %s" % (self.location, err_msg)) 121 # the indexer is initialized - now we prepare the searcher 122 # create a lock for the database directory - to be used later 123 lockname = os.path.join(tempfile.gettempdir(), 124 re.sub("\W", "_", self.location)) 125 #self.dir_lock = jToolkit.glock.GlobalLock(lockname) 126 # windows file locking seems inconsistent, so we try 10 times 127 numtries = 0 128 #self.dir_lock.acquire(blocking=True) 129 # read "self.reader", "self.indexVersion" and "self.searcher" 130 try: 131 while numtries < 10: 132 try: 133 self.reader = PyLucene.IndexReader.open(self.location) 134 self.indexVersion = self.reader.getCurrentVersion( 135 self.location) 136 self.searcher = PyLucene.IndexSearcher(self.reader) 137 break 138 except PyLucene.JavaError, e: 139 # store error message for possible later re-raise (below) 140 lock_error_msg = e 141 time.sleep(0.01) 142 numtries += 1 143 else: 144 # locking failed for 10 times 145 raise OSError("Indexer: failed to lock index database" \ 146 + " (%s)" % lock_error_msg) 147 finally: 148 pass 149 # self.dir_lock.release() 150 # initialize the searcher and the reader 151 self._index_refresh()
152
153 - def __del__(self):
154 """remove lock and close writer after loosing the last reference""" 155 self._writer_close()
156
157 - def flush(self, optimize=False):
158 """flush the content of the database - to force changes to be written 159 to disk 160 161 some databases also support index optimization 162 163 @param optimize: should the index be optimized if possible? 164 @type optimize: bool 165 """ 166 if self._writer_is_open(): 167 try: 168 if optimize: 169 self.writer.optimize() 170 finally: 171 # close the database even if optimizing failed 172 self._writer_close() 173 # the reader/searcher needs an update, too 174 self._index_refresh()
175
176 - def _create_query_for_query(self, query):
177 """generate a query based on an existing query object 178 179 basically this function should just create a copy of the original 180 181 @param query: the original query object 182 @type query: PyLucene.Query 183 @return: resulting query object 184 @rtype: PyLucene.Query 185 """ 186 # TODO: a deep copy or a clone would be safer 187 # somehow not working (returns "null"): copy.deepcopy(query) 188 return query
189
190 - def _create_query_for_string(self, text, require_all=True, 191 analyzer=None):
192 """generate a query for a plain term of a string query 193 194 basically this function parses the string and returns the resulting 195 query 196 197 @param text: the query string 198 @type text: str 199 @param require_all: boolean operator 200 (True -> AND (default) / False -> OR) 201 @type require_all: bool 202 @param analyzer: the analyzer to be used 203 possible analyzers are: 204 - L{CommonDatabase.ANALYZER_TOKENIZE} 205 the field value is splitted to be matched word-wise 206 - L{CommonDatabase.ANALYZER_PARTIAL} 207 the field value must start with the query string 208 - L{CommonDatabase.ANALYZER_EXACT} 209 keep special characters and the like 210 @type analyzer: bool 211 @return: resulting query object 212 @rtype: PyLucene.Query 213 """ 214 if analyzer is None: 215 analyzer = self.analyzer 216 if analyzer == self.ANALYZER_EXACT: 217 analyzer_obj = PyLucene.KeywordAnalyzer() 218 else: 219 text = _escape_term_value(text) 220 analyzer_obj = PyLucene.StandardAnalyzer() 221 qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj) 222 if (analyzer & self.ANALYZER_PARTIAL > 0): 223 # PyLucene uses explicit wildcards for partial matching 224 text += "*" 225 if require_all: 226 qp.setDefaultOperator(qp.Operator.AND) 227 else: 228 qp.setDefaultOperator(qp.Operator.OR) 229 return qp.parse(text)
230
231 - def _create_query_for_field(self, field, value, analyzer=None):
232 """generate a field query 233 234 this functions creates a field->value query 235 236 @param field: the fieldname to be used 237 @type field: str 238 @param value: the wanted value of the field 239 @type value: str 240 @param analyzer: the analyzer to be used 241 possible analyzers are: 242 - L{CommonDatabase.ANALYZER_TOKENIZE} 243 the field value is splitted to be matched word-wise 244 - L{CommonDatabase.ANALYZER_PARTIAL} 245 the field value must start with the query string 246 - L{CommonDatabase.ANALYZER_EXACT} 247 keep special characters and the like 248 @type analyzer: bool 249 @return: resulting query object 250 @rtype: PyLucene.Query 251 """ 252 if analyzer is None: 253 analyzer = self.analyzer 254 if analyzer == self.ANALYZER_EXACT: 255 analyzer_obj = PyLucene.KeywordAnalyzer() 256 else: 257 value = _escape_term_value(value) 258 analyzer_obj = PyLucene.StandardAnalyzer() 259 qp = PyLucene.QueryParser(field, analyzer_obj) 260 if (analyzer & self.ANALYZER_PARTIAL > 0): 261 # PyLucene uses explicit wildcards for partial matching 262 value += "*" 263 return qp.parse(value)
264
265 - def _create_query_combined(self, queries, require_all=True):
266 """generate a combined query 267 268 @param queries: list of the original queries 269 @type queries: list of PyLucene.Query 270 @param require_all: boolean operator 271 (True -> AND (default) / False -> OR) 272 @type require_all: bool 273 @return: the resulting combined query object 274 @rtype: PyLucene.Query 275 """ 276 combined_query = PyLucene.BooleanQuery() 277 for query in queries: 278 combined_query.add( 279 PyLucene.BooleanClause(query, _occur(require_all, False))) 280 return combined_query
281
282 - def _create_empty_document(self):
283 """create an empty document to be filled and added to the index later 284 285 @return: the new document object 286 @rtype: PyLucene.Document 287 """ 288 return PyLucene.Document()
289
290 - def _add_plain_term(self, document, term, tokenize=True):
291 """add a term to a document 292 293 @param document: the document to be changed 294 @type document: PyLucene.Document 295 @param term: a single term to be added 296 @type term: str 297 @param tokenize: should the term be tokenized automatically 298 @type tokenize: bool 299 """ 300 if tokenize: 301 token_flag = PyLucene.Field.Index.TOKENIZED 302 else: 303 token_flag = PyLucene.Field.Index.UN_TOKENIZED 304 document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term, 305 PyLucene.Field.Store.YES, token_flag))
306
307 - def _add_field_term(self, document, field, term, tokenize=True):
308 """add a field term to a document 309 310 @param document: the document to be changed 311 @type document: PyLucene.Document 312 @param field: name of the field 313 @type field: str 314 @param term: term to be associated to the field 315 @type term: str 316 @param tokenize: should the term be tokenized automatically 317 @type tokenize: bool 318 """ 319 if tokenize: 320 token_flag = PyLucene.Field.Index.TOKENIZED 321 else: 322 token_flag = PyLucene.Field.Index.UN_TOKENIZED 323 document.add(PyLucene.Field(str(field), term, 324 PyLucene.Field.Store.YES, token_flag))
325
326 - def _add_document_to_index(self, document):
327 """add a prepared document to the index database 328 329 @param document: the document to be added 330 @type document: PyLucene.Document 331 """ 332 self._writer_open() 333 self.writer.addDocument(document)
334
335 - def begin_transaction(self):
336 """PyLucene does not support transactions 337 338 Thus this function just opens the database for write access. 339 Call "cancel_transaction" or "commit_transaction" to close write 340 access in order to remove the exclusive lock from the database 341 directory. 342 """ 343 self._writer_open()
344
345 - def cancel_transaction(self):
346 """PyLucene does not support transactions 347 348 Thus this function just closes the database write access and removes 349 the exclusive lock. 350 351 See 'start_transaction' for details. 352 """ 353 self._writer_close()
354
355 - def commit_transaction(self):
356 """PyLucene does not support transactions 357 358 Thus this function just closes the database write access and removes 359 the exclusive lock. 360 361 See 'start_transaction' for details. 362 """ 363 self._writer_close() 364 self._index_refresh()
365
366 - def get_query_result(self, query):
367 """return an object containing the results of a query 368 369 @param query: a pre-compiled query 370 @type query: a query object of the real implementation 371 @return: an object that allows access to the results 372 @rtype: subclass of CommonEnquire 373 """ 374 return PyLuceneHits(self.searcher.search(query))
375
376 - def delete_document_by_id(self, docid):
377 """delete a specified document 378 379 @param docid: the document ID to be deleted 380 @type docid: int 381 """ 382 self.reader.deleteDocument(docid) 383 # TODO: check the performance impact of calling "refresh" for each id 384 self._index_refresh()
385
386 - def search(self, query, fieldnames):
387 """return a list of the contents of specified fields for all matches of 388 a query 389 390 @param query: the query to be issued 391 @type query: a query object of the real implementation 392 @param fieldnames: the name(s) of a field of the document content 393 @type fieldnames: string | list of strings 394 @return: a list of dicts containing the specified field(s) 395 @rtype: list of dicts 396 """ 397 if isinstance(fieldnames, basestring): 398 fieldnames = [fieldnames] 399 hits = self.searcher.search(query) 400 if _COMPILER == 'jcc': 401 # add the ranking number and the retrieved document to the array 402 hits = [(hit, hits.doc(hit)) for hit in range(hits.length())] 403 result = [] 404 for hit, doc in hits: 405 fields = {} 406 for fieldname in fieldnames: 407 # take care for the special field "None" 408 if fieldname is None: 409 pyl_fieldname = UNNAMED_FIELD_NAME 410 else: 411 pyl_fieldname = fieldname 412 fields[fieldname] = doc.getValues(pyl_fieldname) 413 result.append(fields) 414 return result
415
416 - def _writer_open(self):
417 """open write access for the indexing database and acquire an 418 exclusive lock 419 """ 420 if not self._writer_is_open(): 421 #self.dir_lock.acquire() 422 self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer, 423 False) 424 # "setMaxFieldLength" is available since PyLucene v2 425 # we must stay compatible to v1 for the derived class 426 # (PyLuceneIndexer1) - thus we make this step optional 427 if hasattr(self.writer, "setMaxFieldLength"): 428 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
429 # do nothing, if it is already open 430
431 - def _writer_close(self):
432 """close indexing write access and remove the database lock""" 433 if self._writer_is_open(): 434 self.writer.close() 435 self.writer = None
436 # make sure that the lock is removed 437 #self.dir_lock.forcerelease() 438
439 - def _writer_is_open(self):
440 """check if the indexing write access is currently open""" 441 return not self.writer is None
442
443 - def _index_refresh(self):
444 """re-read the indexer database""" 445 #try: 446 #self.dir_lock.acquire(blocking=False) 447 #except jToolkit.glock.GlobalLockError, e: 448 # if this fails the index is being rewritten, so we continue with 449 # our old version 450 # return 451 try: 452 if self.reader is None or self.searcher is None: 453 self.reader = PyLucene.IndexReader.open(self.location) 454 self.searcher = PyLucene.IndexSearcher(self.reader) 455 elif self.index_version != self.reader.getCurrentVersion( \ 456 self.location): 457 self.searcher.close() 458 self.reader.close() 459 self.reader = PyLucene.IndexReader.open(self.location) 460 self.searcher = PyLucene.IndexSearcher(self.reader) 461 self.index_version = self.reader.getCurrentVersion(self.location) 462 except PyLucene.JavaError,e: 463 # TODO: add some debugging output? 464 #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e)) 465 pass
466 #self.dir_lock.release() 467 468 469
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
                "matches" is a dictionary of::
                    ["rank", "percent", "document", "docid"]
        """
        available = self.enquire.length()
        # "stop" is the lowest index number to be omitted
        stop = min(start + number, available)
        if stop <= start:
            # the requested range is empty or lies beyond the result set
            return (0, available, [])
        matches = []
        for position in range(start, stop):
            matches.append({
                "rank": position,
                "docid": self.enquire.id(position),
                "percent": self.enquire.score(position),
                "document": self.enquire.doc(position),
            })
        return (stop - start, available, matches)
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to a lucene "Occur" attribute

    It is an error to specify a clause as both required and prohibited -
    this (and any non-boolean combination) yields None.
    """
    # tuple keys compare by value, so 1/0 behave exactly like True/False here,
    # matching the original "== True" / "== False" comparisons
    mapping = {
        (True, False): PyLucene.BooleanClause.Occur.MUST,
        (False, False): PyLucene.BooleanClause.Occur.SHOULD,
        (False, True): PyLucene.BooleanClause.Occur.MUST_NOT,
    }
    return mapping.get((required, prohibited))
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    version = PyLucene.VERSION
    for major in (1, 2):
        if version.startswith("%d." % major):
            return major
    return 0
531 -def _escape_term_value(text):
532 return re.sub("\*", "", text)
533