Package translate :: Package search :: Package indexing :: Module CommonIndexer
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.indexing.CommonIndexer

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2008 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22   
 23  """ 
 24  base class for interfaces to indexing engines for pootle 
 25  """ 
 26   
 27  import translate.lang.data 
 28  import os 
 29   
 30  __revision__ = "$Id: CommonIndexer.py 8507 2008-09-27 09:15:08Z dwaynebailey $" 
 31   
 32   
def is_available():
    """report whether this indexing engine interface is usable

    every module that contains indexing engine interfaces must provide
    this function

    @return: is this interface usable?
    @rtype: bool
    """
    # the base module never provides a working engine itself
    return False
43 44
class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it empty
            to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        # reject incomplete implementations before touching the filesystem
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: "
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am"
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        # give every instance its own mapping instead of sharing the class dict
        self.field_analyzers = {}

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'match_text_partial' can override
        the previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                xapian.Query("foo")
                "bar"
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list if necessary
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list if necessary
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # just add precompiled queries
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                field, value = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # check for the chosen match type
                # BUGFIX: use a per-item analyzer - previously the analyzer of
                # the first field was assigned to the shared 'analyzer'
                # variable and leaked into all following queries of the list
                if analyzer is None:
                    item_analyzer = self.get_field_analyzers(field)
                else:
                    item_analyzer = analyzer
                result.append(self._create_query_for_field(field, value,
                        analyzer=item_analyzer))
            # parse plaintext queries
            elif isinstance(query, basestring):
                if analyzer is None:
                    item_analyzer = self.analyzer
                else:
                    item_analyzer = analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=item_analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s"
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of terms are indexed separately.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                key, value = dataset
                if key is None:
                    # keyless values are plain terms (a single string or a
                    # list of strings)
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, basestring):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s"
                                % str(type(data)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
                else:
                    analyze_settings = self.get_field_analyzers(key)
                    # handle multiple terms
                    if not isinstance(value, list):
                        value = [value]
                    for one_term in value:
                        self._add_field_term(doc, key, self._decode(one_term),
                                (analyze_settings & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, basestring):
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s"
                        % str(type(data)))
        self._add_document_to_index(doc)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps, if you want
        that a changeset is committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'commit_transaction' is missing")

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: "
                + "'search' is missing")

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of tuples | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        """
        # turn a doc-ID into a list of doc-IDs
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # no matching items
            return 0
        if isinstance(ident_list[0], int):
            # create a list of IDs of all successfully removed documents
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # something like: { "msgid": "foobar" }
            # assemble all queries
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=True)
        elif isinstance(ident_list[0], object):
            # assume a query object (with 'AND')
            # NOTE: everything passes this isinstance check, so the TypeError
            # branch below is unreachable - kept for future refactoring
            query = self.make_query(ident_list, require_all=True)
        else:
            # invalid element type in list (not necessarily caught in the
            # lines above)
            raise TypeError("description of documents to-be-deleted is not "
                    + "supported: list of %s" % type(ident_list[0]))
        # we successfully created a query - now iterate through the result
        # no documents deleted so far ...
        remove_list = []
        # delete all resulting documents step by step
        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        return self.delete_doc(remove_list)

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single match
        of a query

        example::
            self._walk_matches(query, function_for_match, arg_for_func)
        'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # execute the query
        enquire = self.get_query_result(query)
        # start with the first element
        start = 0
        # do the loop at least once
        size, avail = (0, 1)
        # how many results per 'get_matches'?
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @raise TypeError: invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid input types
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # map the analyzer to the field name
            self.field_analyzers[field] = analyzer

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple fields)
            is requested; leave empty (or "None") to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy
            return dict(self.field_analyzers)
        # one field is requested
        if isinstance(fieldnames, (str, unicode)):
            # 'in' replaces the deprecated 'has_key'
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        return self.analyzer

    def _decode(self, text):
        """decode the string from utf-8 or charmap
        perform unicode normalization
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            # BUGFIX: 'str.decode' raises UnicodeDecodeError; the original
            # caught UnicodeEncodeError, so the charmap fallback was dead code
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        # perform unicode normalization
        return translate.lang.data.normalize(result)
562 563
class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """wrap an enquire object of one of the supported backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        raise NotImplementedError("Incomplete indexing implementation: "
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonIndexer.search" to retrieve the exact number of matches

        @return: the estimated number of matches
        @rtype: int
        """
        # only the middle element of the result tuple is of interest here
        _returned, estimated, _matches = self.get_matches(0, 1)
        return estimated
600