Package translate :: Package misc :: Module quote
[hide private]
[frames | no frames]

Source Code for Module translate.misc.quote

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """String processing utilities for extracting strings with various kinds 
 23  of delimiters""" 
 24   
 25  import logging 
 26  import htmlentitydefs 
 27   
 28  from translate.misc.typecheck import accepts 
def find_all(searchin, substr):
    """Return the start offsets of every non-overlapping occurrence of substr.

    The search runs left to right; after a match it resumes immediately
    behind that match, so the returned locations never overlap.

    searchin -- the string to search in
    substr   -- the string to look for

    Returns a list of integer offsets in ascending order. An empty substr
    yields an empty list: it would otherwise "match" at the same offset
    forever, because the search position could never advance.
    """
    step = len(substr)
    if step == 0:
        # Guard against the endless loop described in the docstring.
        return []
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        # Skip over the whole match so that occurrences cannot overlap.
        location = searchin.find(substr, location + step)
    return locations
42
def extract(source, startdelim, enddelim,
            escape=None, startinstring=False, allowreentry=True):
    """Pull the delimited sections out of source, delimiters included.

    source        -- the text to scan
    startdelim    -- the string that opens a section
    enddelim      -- the string that closes a section (may equal startdelim)
    escape        -- optional escape string; a delimiter directly preceded
                     by an active (not itself escaped) escape is ignored
    startinstring -- True if source starts inside an already opened section
    allowreentry  -- if False, only the first opened section is extracted

    Returns a tuple (extracted, instring): the concatenation of all
    extracted sections, and whether the scan was still inside a section
    when the end of source was reached.
    """
    # Note that the delimiters themselves are part of the returned text.
    start_len = len(startdelim)
    end_len = len(enddelim)
    starts = find_all(source, startdelim)
    if startdelim == enddelim:
        raw_ends = list(starts)
    else:
        raw_ends = find_all(source, enddelim)

    if escape is None:
        # End places point just behind the closing delimiter.
        ends = [place + end_len for place in raw_ends]
    else:
        esc_len = len(escape)
        all_escapes = find_all(source, escape)
        # An escape directly following another escape toggles the state
        # (it may itself be escaped); any other escape is always active.
        active = False
        live_escapes = []
        for esc in all_escapes:
            if (esc - esc_len) in all_escapes:
                active = not active
            else:
                active = True
            if active:
                live_escapes.append(esc)
        # Delimiters directly preceded by an active escape do not count.
        starts = [place for place in starts
                  if (place - esc_len) not in live_escapes]
        ends = [place + end_len for place in raw_ends
                if (place - esc_len) not in live_escapes]

    # Sorted (not de-duplicated) list of every position worth looking at.
    boundaries = sorted([0] + starts + ends + [len(source) - 1])

    pieces = []
    inside = startinstring
    entered = False
    mark = None
    for here in boundaries:
        if inside and here in ends:
            # With identical delimiters, the delimiter that opened the
            # section must not be taken for the one that closes it.
            if mark == here - start_len and mark in starts:
                continue
            pieces.append(source[mark:here])
            inside = False
            mark = here
        if not inside and here in starts and (allowreentry or not entered):
            inside = True
            entered = True
            mark = here
    if inside:
        # Unterminated section: take everything up to the end of source.
        pieces.append(source[mark:])
    return ("".join(pieces), inside)
98
def extractwithoutquotes(source, startdelim, enddelim, escape=None,
                         startinstring=False, includeescapes=True,
                         allowreentry=True):
    """Extracts the delimited sections of a string, without the delimiters.

    source         -- the text to scan
    startdelim     -- the string that opens a section
    enddelim       -- the string that closes a section (may equal startdelim)
    escape         -- optional escape string; a delimiter directly preceded
                      by an active (not itself escaped) escape is ignored
    startinstring  -- True if source starts inside an already opened section
    includeescapes -- True keeps escapes in the result, False removes them.
                      It can also be a function that takes the whole escaped
                      string (escape plus the following character) and
                      returns the replaced version; returning a non-string
                      is the old protocol (true: keep the escaped string,
                      false: keep only the character after the escape).
    allowreentry   -- if False, only the first opened section is extracted

    Returns a tuple (extracted, instring): the concatenation of the section
    contents, and whether the scan was still inside a section at the end.
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    # NOTE: original author's remark - this is very slow because it is
    # called far too often
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # filter escaped escapes: an escape directly following another
        # escape toggles the state, any other escape is always active
        true_escape = False
        true_escape_places = []
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_places:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        # delimiters directly preceded by an active escape do not count;
        # end places are shifted to point just behind the end delimiter
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a sorted list of the significant places in the string
    # (duplicates are not removed)
    significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
    significant_places.sort()
    extracted = ""
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    # escapes only need processing when they are dropped or rewritten
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        # the last condition stops the opening delimiter from being taken
        # for the closing one when both delimiters are identical
        if instring and pos in enddelim_places and lastpos != pos - lenstart:
            # NOTE(review): with startinstring=True lastpos is still 0 here,
            # so the first len(startdelim) characters of source are skipped
            # although no delimiter was seen - confirm this is intended
            section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # active escapes inside this section, relative to its start
                escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        replace_escape = includeescapes(section[epos:epos + lenescape + 1])
                        # TODO: deprecate old method of returning boolean from
                        # includeescape, by removing this if block
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                replace_escape = section[epos:epos + lenescape + 1]
                            else:
                                replace_escape = section[epos + lenescape:epos + lenescape + 1]
                        new_section += replace_escape
                        # continue behind the escape and the escaped character
                        last_epos = epos + lenescape + 1
                    else:
                        # drop the escape itself, keep what follows it
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: take everything up to the end of source
        section_start = lastpos + len(startdelim)
        section = source[section_start:]
        # NOTE(review): this branch only runs when includeescapes is false,
        # so the callable_includeescapes test below looks unreachable and a
        # callable includeescapes is not applied to an unterminated section
        # (unlike in the loop above) - confirm this is intended
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
188
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote preceded by a backslash.

    If escapeescapes is true, existing backslashes are doubled first, so
    that they cannot be mistaken for the escapes added here.
    """
    escaped = source
    if escapeescapes:
        escaped = escaped.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
196
def escapesinglequotes(source):
    """Return source with every single quote doubled ('' instead of ')."""
    quote = "'"
    return source.replace(quote, quote * 2)
201
def htmlentityencode(source):
    """Encode source using named HTML entities, e.g. the copyright sign
    becomes &copy;.

    Every character whose code point has an entry in
    htmlentitydefs.codepoint2name is replaced by "&name;". All other
    characters are copied through unchanged.

    source -- the string to encode

    Returns the encoded string.
    """
    codepoint2name = htmlentitydefs.codepoint2name
    pieces = []
    for char in source:
        name = codepoint2name.get(ord(char))
        if name is None:
            # Copy the character as it is. Forcing it through str() would
            # raise UnicodeEncodeError on Python 2 for non-ASCII characters
            # that have no named entity.
            pieces.append(char)
        else:
            pieces.append("&%s;" % name)
    return "".join(pieces)
213
def htmlentitydecode(source):
    """Decode named HTML entities in source, e.g. &copy; becomes the
    copyright sign.

    An entity runs from "&" up to the next ";". If the name is listed in
    htmlentitydefs.name2codepoint it is replaced by that character.
    Anything else that merely looks like an entity is copied through
    unchanged: unknown names, an "&" that is followed by a space before
    any ";", an "&" followed by another "&", and an "&..." that is still
    open at the end of source.

    source -- the string to decode

    Returns the decoded unicode string.
    """
    pieces = []
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # The previous "&..." was never terminated: keep its text
                # instead of silently dropping it.
                pieces.append("&" + possibleentity)
            inentity = True
            possibleentity = ""
        elif not inentity:
            pieces.append(char)
        elif char == ";":
            if possibleentity in htmlentitydefs.name2codepoint:
                pieces.append(unichr(htmlentitydefs.name2codepoint[possibleentity]))
            else:
                # Unknown (or empty) name: leave the text as it was.
                pieces.append("&" + possibleentity + ";")
            inentity = False
        elif char == " ":
            # Entity names contain no spaces, so this was a plain "&".
            pieces.append("&" + possibleentity + char)
            inentity = False
        else:
            possibleentity += char
    if inentity:
        # Input ended in the middle of a possible entity: keep its text.
        pieces.append("&" + possibleentity)
    return u"".join(pieces)
240
def javapropertiesencode(source):
    """Encode source in the escaped-unicode form used by Java .properties
    files.

    Characters listed in controlchars are replaced by their escaped form,
    other characters below 128 are copied through, and every remaining
    character is written as a backslash-u escape with four upper case hex
    digits.
    """
    pieces = []
    for char in source:
        charnum = ord(char)
        if char in controlchars:
            piece = controlchars[char]
        elif 0 <= charnum < 128:
            piece = str(char)
        else:
            piece = u"\\u%04X" % charnum
        pieces.append(piece)
    return u"".join(pieces)
256
def mozillapropertiesencode(source):
    """Encode source in the form used by Mozilla .properties files.

    Only the characters listed in controlchars are replaced (by their
    escaped form); every other character is copied through unchanged.
    """
    return u"".join(controlchars.get(char, char) for char in source)
# Maps the character that follows a backslash in an escaped string to the
# character the escape stands for.
propertyescapes = {
    # escapes that are self-escaping
    "\\": "\\",
    "'": "'",
    '"': '"',
    # control characters that we keep
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}

# Maps a raw character to its backslash-escaped spelling. This is the
# reverse of the table above, except that the quote characters have no
# entry here.
controlchars = {
    "\\": "\\\\",
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
def escapecontrols(source):
    """Escape the control characters (and backslashes) in the given string.

    Every character that has an entry in controlchars is replaced by its
    escaped form; all other characters are copied through unchanged.

    source -- the string to escape

    Returns the escaped string, of the same string type as source.
    """
    # Translate each character exactly once. Chaining str.replace() calls
    # over the dictionary would make the result depend on its iteration
    # order: if the backslash is handled after another control character,
    # the backslash that was just inserted gets doubled as well.
    # source[:0] is an empty string of the same type as source, so joining
    # on it keeps the result type unchanged even for empty input.
    return source[:0].join([controlchars.get(char, char) for char in source])
289
@accepts(unicode)
def propertiesdecode(source):
    """Decodes source from the escaped-unicode encoding used by .properties
    files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    Since the .decode("unicode-escape") routine decodes everything, and we
    don't want that, the algorithm from Python's Objects/unicode.c is
    reimplemented here and modified to retain escaped control characters.

    Handled escapes:

    - backslash + newline: line continuation, both characters are dropped
    - the escapes listed in propertyescapes
    - backslash + u or U followed by up to four hex digits; parsing stops
      at the first character that is not a hex digit, which is then
      processed normally
    - backslash + N{name}: the character with that Unicode name; malformed
      or unknown names are logged as a warning and kept verbatim
    - a backslash at the very end of source is kept, to tell the parser
      that the line continues
    - any other escaped character is kept without its backslash

    source -- the unicode string to decode

    Returns the decoded unicode string.
    """
    hexdigits = "0123456789abcdef"

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i,
        otherwise an escaped control character.
        """
        if 32 <= i:
            return unichr(i)
        elif unichr(i) in controlchars:
            # we just return the character, unescaped
            # if people want to escape them they can use escapecontrols
            return unichr(i)
        else:
            return "\\u%04x" % i

    # No byte string handling is needed here: the accepts decorator is
    # declared with unicode as the only argument type.
    output = u""
    s = 0
    length = len(source)
    while s < length:
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= length:
            # this is an escape at the end of the line, which implies
            # a continuation..., return the escape to inform the parser
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output += propertyescapes[c]
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x = 0
            digits = 0
            # Only shift the value for characters that really are hex
            # digits, and only consume those, so that a short or invalid
            # escape neither corrupts the code point nor swallows the
            # characters following it.
            while digits < 4 and s + digits < length:
                digit = source[s + digits].lower()
                if digit not in hexdigits:
                    break
                x = (x << 4) + int(digit, 16)
                digits += 1
            s += digits
            output += unichr2(x)
        elif c == "N":
            # In all error cases below s still points behind the "N", so
            # the rest of the text is copied through by the main loop.
            if s >= length or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                output += "\\" + c
                continue
            import unicodedata
            name = source[s + 1:e]
            try:
                output += unicodedata.lookup(name)
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names
                logging.warning("Invalid named unicode escape: unknown character name %r", name)
                output += "\\" + c
                continue
            s = e + 1
        else:
            output += c  # Drop any \ that we don't specifically handle
    return output
def quotestr(source, escapeescapes=0):
    """Returns a doublequote-delimited quoted string, escaping double
    quotes with backslash.

    source        -- a string, or a list of strings (lines)
    escapeescapes -- if true, backslashes are escaped as well (passed on
                     to escapequotes)

    For a list, every line is quoted separately and the quoted lines are
    joined with newlines; an empty list gives an empty string.
    """
    if isinstance(source, list):
        # join() also copes with an empty list, for which a manual loop
        # would never bind its result variable.
        return '\n'.join(['"' + escapequotes(line, escapeescapes) + '"'
                          for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
394
def singlequotestr(source):
    """Return source wrapped in single quotes, with the single quotes it
    contains escaped by escapesinglequotes.
    """
    quote = "'"
    return quote + escapesinglequotes(source) + quote
401
def findend(string, substring):
    """Return the offset just behind the first occurrence of substring in
    string, or -1 if substring does not occur.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
408
def rstripeol(string):
    """Return string without any trailing carriage returns and newlines."""
    eol_chars = "\r\n"
    return string.rstrip(eol_chars)
412
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Return the text between the comment markers, without surrounding
    whitespace.

    comment     -- the text containing the comment
    startstring -- the marker that opens the comment
    endstring   -- the marker that closes the comment

    If startstring is missing the text is taken from the beginning of
    comment; if endstring is missing (behind the start marker) the text is
    taken up to the end of comment.
    """
    cstart = comment.find(startstring)
    if cstart == -1:
        cstart = 0
    else:
        cstart += len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # Without this, the -1 would be used as a slice index and silently
        # cut off the last character of the comment.
        cend = len(comment)
    return comment[cstart:cend].strip()
422
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Return comment, stripped of surrounding whitespace, wrapped between
    startstring and endstring.
    """
    body = comment.strip()
    return "".join((startstring, body, endstring))
426