Package translate :: Package search :: Module segment
[hide private]
[frames | no frames]

Source Code for Module translate.search.segment

 1  #!/usr/bin/env python 
 2  # -*- coding: utf-8 -*- 
 3  # 
 4  # Copyright 2006 Zuza Software Foundation 
 5  # 
 6  # This file is part of translate. 
 7  # 
 8  # This program is free software; you can redistribute it and/or modify 
 9  # it under the terms of the GNU General Public License as published by 
10  # the Free Software Foundation; either version 2 of the License, or 
11  # (at your option) any later version. 
12  # 
13  # This program is distributed in the hope that it will be useful, 
14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
16  # GNU General Public License for more details. 
17  # 
18  # You should have received a copy of the GNU General Public License 
19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
20   
21  """Module to deal with different types and uses of segmentation""" 
22   
23  #XXX: This module is now deprecated: Use language specific segmenters in the 
24  # lang package (character_iter, word_iter, sentence_iter, etc.). 
25   
# Characters treated as punctuation by this module: character_iter() drops
# them from its output and word_iter() strips them from both ends of each word.
punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"
27   
28   
def character_iter(text):
    """Yield the lower-cased, non-punctuation characters of text.

    A whitespace character is skipped when the previously examined character
    was also whitespace, so runs of whitespace collapse to their first
    character.  Punctuation characters are never yielded, but they still count
    as the "previous" character for the whitespace check.
    """
    # Seed with a non-whitespace value so leading whitespace is not skipped.
    previous = 'A'
    for char in text:
        if not (char.isspace() and previous.isspace()):
            previous = char
            if char not in punctuation:
                yield char.lower()
39 40
def characters(text):
    """Return a list of everything character_iter() yields for text."""
    return list(character_iter(text))
44 45
def word_iter(text):
    """Yield each whitespace-separated word of text, cleaned up.

    Every word has leading and trailing punctuation characters stripped and
    is converted to lower case.  A word consisting only of punctuation is
    yielded as an empty string.
    """
    # TODO: Consider replacing punctuation with space before split()
    tokens = text.split()
    for token in tokens:
        stripped = token.strip(punctuation)
        yield stripped.lower()
51 52
def words(text):
    """Return a list of everything word_iter() yields for text."""
    return list(word_iter(text))
56 57
def sentence_iter(text):
    """Yield the sentences in text.

    The text is split on the two-character sequence ". " (full stop followed
    by a space) and each piece is yielded with surrounding whitespace removed.
    The separator itself is not part of the yielded pieces.
    """
    # TODO: This is very naive. We really should consider all punctuation,
    # and return the punctuation with the sentence.
    # TODO: Search for capital letter start with next sentence to avoid
    # confusion with abbreviations. And remember Afrikaans "'n" :-)
    pieces = text.split(". ")
    for piece in pieces:
        trimmed = piece.strip()
        yield trimmed
66 67
def sentences(text):
    """Return a list of everything sentence_iter() yields for text."""
    return list(sentence_iter(text))
71