Package translate :: Package search :: Module segment
[hide private]
[frames] | no frames]

Source Code for Module translate.search.segment

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006 Zuza Software Foundation
#
# This file is part of translate.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Module to deal with different types and uses of segmentation"""

#XXX: This module is now deprecated: Use language specific segmenters in the
# lang package (character_iter, word_iter, sentence_iter, etc.).

# Characters treated as punctuation by the segmenters in this module:
# character_iter() leaves them out of its output and word_iter() strips them
# from both ends of every word.
punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"

def character_iter(text):
    """Returns an iterator over the characters in text.

    Characters found in the module-level ``punctuation`` string are left
    out, every other character is yielded lowercased, and a run of
    whitespace is reduced to its first character.
    """
    # We don't return more than one consecutive whitespace character. Only
    # characters that were actually yielded are remembered, so whitespace on
    # both sides of dropped punctuation (e.g. "a , b") is collapsed as well.
    last_yielded = 'A'
    for char in text:
        if char in punctuation:
            continue
        if char.isspace() and last_yielded.isspace():
            continue
        last_yielded = char
        yield char.lower()
def characters(text):
    """Returns a list of characters in text.

    This is the output of character_iter(text) collected into a list.
    """
    return list(character_iter(text))
def word_iter(text):
    """Returns an iterator over the words in text.

    The text is split on whitespace; each piece has leading and trailing
    characters from the module-level ``punctuation`` string stripped and is
    lowercased. Pieces that consist of punctuation only (e.g. a lone "-")
    are skipped rather than yielded as empty strings.
    """
    #TODO: Consider replacing punctuation with space before split()
    for token in text.split():
        word = token.strip(punctuation)
        if word:
            yield word.lower()
def words(text):
    """Returns a list of words in text.

    This is the output of word_iter(text) collected into a list.
    """
    return list(word_iter(text))
def sentence_iter(text):
    """Returns an iterator over the sentences in text.

    The text is split on ". " (full stop followed by a space) and each piece
    is yielded with surrounding whitespace stripped. The separator itself is
    not part of the yielded sentences, so only the last sentence can keep its
    closing full stop. Pieces that are empty after stripping (empty input, a
    trailing ". ") are skipped.
    """
    #TODO: This is very naïve. We really should consider all punctuation,
    #and return the punctuation with the sentence.
    #TODO: Search for capital letter start with next sentence to avoid
    #confusion with abbreviations. And remember Afrikaans "'n" :-)
    for piece in text.split(". "):
        sentence = piece.strip()
        if sentence:
            yield sentence
def sentences(text):
    """Returns a list of sentences in text.

    This is the output of sentence_iter(text) collected into a list.
    """
    return list(sentence_iter(text))