1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 interface for the PyLucene (v2.x) indexing engine
25
26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
27 """
28
29 __revision__ = "$Id: PyLuceneIndexer.py 11946 2009-07-19 23:59:10Z alaaosh $"
30
31 import CommonIndexer
32
33
34 import tempfile
35 import re
36 import os
37 import time
38
39
40
41
42 try:
43 import PyLucene
44 _COMPILER = 'gcj'
45 except ImportError:
46
47 import lucene
48 PyLucene = lucene
49 PyLucene.initVM(PyLucene.CLASSPATH)
50 _COMPILER = 'jcc'
51
52
53 UNNAMED_FIELD_NAME = "FieldWithoutAName"
54 MAX_FIELD_SIZE = 1048576
55
56
59
60
62 """manage and use a pylucene indexing database"""
63
64 QUERY_TYPE = PyLucene.Query
65 INDEX_DIRECTORY_NAME = "lucene"
66
67 - def __init__(self, basedir, analyzer=None, create_allowed=True):
68 """initialize or open an indexing database
69
70 Any derived class must override __init__.
71
72 @raise ValueError: the given location exists, but the database type
73 is incompatible (e.g. created by a different indexing engine)
74 @raise OSError: the database failed to initialize
75
76 @param basedir: the parent directory of the database
77 @type basedir: str
78 @param analyzer: bitwise combination of possible analyzer flags
79 to be used as the default analyzer for this database. Leave it empty
80 to use the system default analyzer (self.ANALYZER_DEFAULT).
81 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
82 @type analyzer: int
83 @param create_allowed: create the database, if necessary; default: True
84 @type create_allowed: bool
85 """
86 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
87 create_allowed=create_allowed)
88 self.pyl_analyzer = PyLucene.StandardAnalyzer()
89 self.writer = None
90 self.reader = None
91 self.index_version = None
92 try:
93
94 tempreader = PyLucene.IndexReader.open(self.location)
95 tempreader.close()
96 except PyLucene.JavaError, err_msg:
97
98
99
100
101
102 if not create_allowed:
103 raise OSError("Indexer: skipping database creation")
104 try:
105
106 parent_path = os.path.dirname(self.location)
107 if not os.path.isdir(parent_path):
108
109 os.makedirs(parent_path)
110 except IOError, err_msg:
111 raise OSError("Indexer: failed to create the parent " \
112 + "directory (%s) of the indexing database: %s" \
113 % (parent_path, err_msg))
114 try:
115 tempwriter = PyLucene.IndexWriter(self.location,
116 self.pyl_analyzer, True)
117 tempwriter.close()
118 except PyLucene.JavaError, err_msg:
119 raise OSError("Indexer: failed to open or create a Lucene" \
120 + " database (%s): %s" % (self.location, err_msg))
121
122
123 lockname = os.path.join(tempfile.gettempdir(),
124 re.sub("\W", "_", self.location))
125
126
127 numtries = 0
128
129
130 try:
131 while numtries < 10:
132 try:
133 self.reader = PyLucene.IndexReader.open(self.location)
134 self.indexVersion = self.reader.getCurrentVersion(
135 self.location)
136 self.searcher = PyLucene.IndexSearcher(self.reader)
137 break
138 except PyLucene.JavaError, e:
139
140 lock_error_msg = e
141 time.sleep(0.01)
142 numtries += 1
143 else:
144
145 raise OSError("Indexer: failed to lock index database" \
146 + " (%s)" % lock_error_msg)
147 finally:
148 pass
149
150
151 self._index_refresh()
152
154 """remove lock and close writer after loosing the last reference"""
155 self._writer_close()
156
157 - def flush(self, optimize=False):
158 """flush the content of the database - to force changes to be written
159 to disk
160
161 some databases also support index optimization
162
163 @param optimize: should the index be optimized if possible?
164 @type optimize: bool
165 """
166 if self._writer_is_open():
167 try:
168 if optimize:
169 self.writer.optimize()
170 finally:
171
172 self._writer_close()
173
174 self._index_refresh()
175
177 """generate a query based on an existing query object
178
179 basically this function should just create a copy of the original
180
181 @param query: the original query object
182 @type query: PyLucene.Query
183 @return: resulting query object
184 @rtype: PyLucene.Query
185 """
186
187
188 return query
189
192 """generate a query for a plain term of a string query
193
194 basically this function parses the string and returns the resulting
195 query
196
197 @param text: the query string
198 @type text: str
199 @param require_all: boolean operator
200 (True -> AND (default) / False -> OR)
201 @type require_all: bool
202 @param analyzer: the analyzer to be used
203 possible analyzers are:
204 - L{CommonDatabase.ANALYZER_TOKENIZE}
205 the field value is splitted to be matched word-wise
206 - L{CommonDatabase.ANALYZER_PARTIAL}
207 the field value must start with the query string
208 - L{CommonDatabase.ANALYZER_EXACT}
209 keep special characters and the like
210 @type analyzer: bool
211 @return: resulting query object
212 @rtype: PyLucene.Query
213 """
214 if analyzer is None:
215 analyzer = self.analyzer
216 if analyzer == self.ANALYZER_EXACT:
217 analyzer_obj = PyLucene.KeywordAnalyzer()
218 else:
219 text = _escape_term_value(text)
220 analyzer_obj = PyLucene.StandardAnalyzer()
221 qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
222 if (analyzer & self.ANALYZER_PARTIAL > 0):
223
224 text += "*"
225 if require_all:
226 qp.setDefaultOperator(qp.Operator.AND)
227 else:
228 qp.setDefaultOperator(qp.Operator.OR)
229 return qp.parse(text)
230
232 """generate a field query
233
234 this functions creates a field->value query
235
236 @param field: the fieldname to be used
237 @type field: str
238 @param value: the wanted value of the field
239 @type value: str
240 @param analyzer: the analyzer to be used
241 possible analyzers are:
242 - L{CommonDatabase.ANALYZER_TOKENIZE}
243 the field value is splitted to be matched word-wise
244 - L{CommonDatabase.ANALYZER_PARTIAL}
245 the field value must start with the query string
246 - L{CommonDatabase.ANALYZER_EXACT}
247 keep special characters and the like
248 @type analyzer: bool
249 @return: resulting query object
250 @rtype: PyLucene.Query
251 """
252 if analyzer is None:
253 analyzer = self.analyzer
254 if analyzer == self.ANALYZER_EXACT:
255 analyzer_obj = PyLucene.KeywordAnalyzer()
256 else:
257 value = _escape_term_value(value)
258 analyzer_obj = PyLucene.StandardAnalyzer()
259 qp = PyLucene.QueryParser(field, analyzer_obj)
260 if (analyzer & self.ANALYZER_PARTIAL > 0):
261
262 value += "*"
263 return qp.parse(value)
264
266 """generate a combined query
267
268 @param queries: list of the original queries
269 @type queries: list of PyLucene.Query
270 @param require_all: boolean operator
271 (True -> AND (default) / False -> OR)
272 @type require_all: bool
273 @return: the resulting combined query object
274 @rtype: PyLucene.Query
275 """
276 combined_query = PyLucene.BooleanQuery()
277 for query in queries:
278 combined_query.add(
279 PyLucene.BooleanClause(query, _occur(require_all, False)))
280 return combined_query
281
283 """create an empty document to be filled and added to the index later
284
285 @return: the new document object
286 @rtype: PyLucene.Document
287 """
288 return PyLucene.Document()
289
291 """add a term to a document
292
293 @param document: the document to be changed
294 @type document: PyLucene.Document
295 @param term: a single term to be added
296 @type term: str
297 @param tokenize: should the term be tokenized automatically
298 @type tokenize: bool
299 """
300 if tokenize:
301 token_flag = PyLucene.Field.Index.TOKENIZED
302 else:
303 token_flag = PyLucene.Field.Index.UN_TOKENIZED
304 document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
305 PyLucene.Field.Store.YES, token_flag))
306
308 """add a field term to a document
309
310 @param document: the document to be changed
311 @type document: PyLucene.Document
312 @param field: name of the field
313 @type field: str
314 @param term: term to be associated to the field
315 @type term: str
316 @param tokenize: should the term be tokenized automatically
317 @type tokenize: bool
318 """
319 if tokenize:
320 token_flag = PyLucene.Field.Index.TOKENIZED
321 else:
322 token_flag = PyLucene.Field.Index.UN_TOKENIZED
323 document.add(PyLucene.Field(str(field), term,
324 PyLucene.Field.Store.YES, token_flag))
325
327 """add a prepared document to the index database
328
329 @param document: the document to be added
330 @type document: PyLucene.Document
331 """
332 self._writer_open()
333 self.writer.addDocument(document)
334
336 """PyLucene does not support transactions
337
338 Thus this function just opens the database for write access.
339 Call "cancel_transaction" or "commit_transaction" to close write
340 access in order to remove the exclusive lock from the database
341 directory.
342 """
343 self._writer_open()
344
346 """PyLucene does not support transactions
347
348 Thus this function just closes the database write access and removes
349 the exclusive lock.
350
351 See 'start_transaction' for details.
352 """
353 self._writer_close()
354
356 """PyLucene does not support transactions
357
358 Thus this function just closes the database write access and removes
359 the exclusive lock.
360
361 See 'start_transaction' for details.
362 """
363 self._writer_close()
364 self._index_refresh()
365
367 """return an object containing the results of a query
368
369 @param query: a pre-compiled query
370 @type query: a query object of the real implementation
371 @return: an object that allows access to the results
372 @rtype: subclass of CommonEnquire
373 """
374 return PyLuceneHits(self.searcher.search(query))
375
377 """delete a specified document
378
379 @param docid: the document ID to be deleted
380 @type docid: int
381 """
382 self.reader.deleteDocument(docid)
383
384 self._index_refresh()
385
386 - def search(self, query, fieldnames):
387 """return a list of the contents of specified fields for all matches of
388 a query
389
390 @param query: the query to be issued
391 @type query: a query object of the real implementation
392 @param fieldnames: the name(s) of a field of the document content
393 @type fieldnames: string | list of strings
394 @return: a list of dicts containing the specified field(s)
395 @rtype: list of dicts
396 """
397 if isinstance(fieldnames, basestring):
398 fieldnames = [fieldnames]
399 hits = self.searcher.search(query)
400 if _COMPILER == 'jcc':
401
402 hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
403 result = []
404 for hit, doc in hits:
405 fields = {}
406 for fieldname in fieldnames:
407
408 if fieldname is None:
409 pyl_fieldname = UNNAMED_FIELD_NAME
410 else:
411 pyl_fieldname = fieldname
412 fields[fieldname] = doc.getValues(pyl_fieldname)
413 result.append(fields)
414 return result
415
417 """open write access for the indexing database and acquire an
418 exclusive lock
419 """
420 if not self._writer_is_open():
421
422 self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
423 False)
424
425
426
427 if hasattr(self.writer, "setMaxFieldLength"):
428 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
429
430
432 """close indexing write access and remove the database lock"""
433 if self._writer_is_open():
434 self.writer.close()
435 self.writer = None
436
437
438
440 """check if the indexing write access is currently open"""
441 return not self.writer is None
442
444 """re-read the indexer database"""
445
446
447
448
449
450
451 try:
452 if self.reader is None or self.searcher is None:
453 self.reader = PyLucene.IndexReader.open(self.location)
454 self.searcher = PyLucene.IndexSearcher(self.reader)
455 elif self.index_version != self.reader.getCurrentVersion( \
456 self.location):
457 self.searcher.close()
458 self.reader.close()
459 self.reader = PyLucene.IndexReader.open(self.location)
460 self.searcher = PyLucene.IndexSearcher(self.reader)
461 self.index_version = self.reader.getCurrentVersion(self.location)
462 except PyLucene.JavaError,e:
463
464
465 pass
466
467
468
469
471 """an enquire object contains the information about the result of a request
472 """
473
475 """return a specified number of qualified matches of a previous query
476
477 @param start: index of the first match to return (starting from zero)
478 @type start: int
479 @param number: the number of matching entries to return
480 @type number: int
481 @return: a set of matching entries and some statistics
482 @rtype: tuple of (returned number, available number, matches)
483 "matches" is a dictionary of::
484 ["rank", "percent", "document", "docid"]
485 """
486
487
488 stop = start + number
489 if stop > self.enquire.length():
490 stop = self.enquire.length()
491
492 if stop <= start:
493 return (0, self.enquire.length(), [])
494 result = []
495 for index in range(start, stop):
496 item = {}
497 item["rank"] = index
498 item["docid"] = self.enquire.id(index)
499 item["percent"] = self.enquire.score(index)
500 item["document"] = self.enquire.doc(index)
501 result.append(item)
502 return (stop-start, self.enquire.length(), result)
503
504 -def _occur(required, prohibited):
505 if required == True and prohibited == False:
506 return PyLucene.BooleanClause.Occur.MUST
507 elif required == False and prohibited == False:
508 return PyLucene.BooleanClause.Occur.SHOULD
509 elif required == False and prohibited == True:
510 return PyLucene.BooleanClause.Occur.MUST_NOT
511 else:
512
513
514 return None
515
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    version_string = PyLucene.VERSION
    for major in (1, 2):
        if version_string.startswith("%d." % major):
            return major
    # any other (or unparsable) version is reported as unknown
    return 0
529
530
533