1
- import codecs
2
- from codecs import BOM_UTF8
3
1
import os
4
- import re
5
2
import shlex
6
3
import sys
7
4
import tempfile
5
+ import tokenize
8
6
9
7
import tkinter .filedialog as tkFileDialog
10
8
import tkinter .messagebox as tkMessageBox
20
18
errors = 'surrogateescape'
21
19
22
20
23
- coding_re = re .compile (r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)' , re .ASCII )
24
- blank_re = re .compile (r'^[ \t\f]*(?:[#\r\n]|$)' , re .ASCII )
25
-
26
- def coding_spec (data ):
27
- """Return the encoding declaration according to PEP 263.
28
-
29
- When checking encoded data, only the first two lines should be passed
30
- in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
31
- The first two lines would contain the encoding specification.
32
-
33
- Raise a LookupError if the encoding is declared but unknown.
34
- """
35
- if isinstance (data , bytes ):
36
- # This encoding might be wrong. However, the coding
37
- # spec must be ASCII-only, so any non-ASCII characters
38
- # around here will be ignored. Decoding to Latin-1 should
39
- # never fail (except for memory outage)
40
- lines = data .decode ('iso-8859-1' )
41
- else :
42
- lines = data
43
- # consider only the first two lines
44
- if '\n ' in lines :
45
- lst = lines .split ('\n ' , 2 )[:2 ]
46
- elif '\r ' in lines :
47
- lst = lines .split ('\r ' , 2 )[:2 ]
48
- else :
49
- lst = [lines ]
50
- for line in lst :
51
- match = coding_re .match (line )
52
- if match is not None :
53
- break
54
- if not blank_re .match (line ):
55
- return None
56
- else :
57
- return None
58
- name = match .group (1 )
59
- try :
60
- codecs .lookup (name )
61
- except LookupError :
62
- # The standard encoding error does not indicate the encoding
63
- raise LookupError ("Unknown encoding: " + name )
64
- return name
65
-
66
21
67
22
class IOBinding :
68
23
# One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ def __init__(self, editwin):
78
33
self .save_as )
79
34
self .__id_savecopy = self .text .bind ("<<save-copy-of-window-as-file>>" ,
80
35
self .save_a_copy )
81
- self .fileencoding = None
36
+ self .fileencoding = 'utf-8'
82
37
self .__id_print = self .text .bind ("<<print-window>>" , self .print_window )
83
38
84
39
def close (self ):
@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
165
120
self .text .focus_set ()
166
121
return "break"
167
122
168
- eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
169
- eol_re = re .compile (eol )
170
123
eol_convention = os .linesep # default
171
124
172
125
def loadfile (self , filename ):
173
126
try :
174
- # open the file in binary mode so that we can handle
175
- # end-of-line convention ourselves.
176
- with open (filename , 'rb' ) as f :
177
- two_lines = f .readline () + f .readline ()
178
- f .seek (0 )
179
- bytes = f .read ()
180
- except OSError as msg :
181
- tkMessageBox .showerror ("I/O Error" , str (msg ), parent = self .text )
127
+ try :
128
+ with tokenize .open (filename ) as f :
129
+ chars = f .read ()
130
+ fileencoding = f .encoding
131
+ eol_convention = f .newlines
132
+ converted = False
133
+ except (UnicodeDecodeError , SyntaxError ):
134
+ # Wait for the editor window to appear
135
+ self .editwin .text .update ()
136
+ enc = askstring (
137
+ "Specify file encoding" ,
138
+ "The file's encoding is invalid for Python 3.x.\n "
139
+ "IDLE will convert it to UTF-8.\n "
140
+ "What is the current encoding of the file?" ,
141
+ initialvalue = 'utf-8' ,
142
+ parent = self .editwin .text )
143
+ with open (filename , encoding = enc ) as f :
144
+ chars = f .read ()
145
+ fileencoding = f .encoding
146
+ eol_convention = f .newlines
147
+ converted = True
148
+ except OSError as err :
149
+ tkMessageBox .showerror ("I/O Error" , str (err ), parent = self .text )
182
150
return False
183
- chars , converted = self ._decode (two_lines , bytes )
184
- if chars is None :
151
+ except UnicodeDecodeError :
185
152
tkMessageBox .showerror ("Decoding Error" ,
186
153
"File %s\n Failed to Decode" % filename ,
187
154
parent = self .text )
188
155
return False
189
- # We now convert all end-of-lines to '\n's
190
- firsteol = self .eol_re .search (chars )
191
- if firsteol :
192
- self .eol_convention = firsteol .group (0 )
193
- chars = self .eol_re .sub (r"\n" , chars )
156
+
194
157
self .text .delete ("1.0" , "end" )
195
158
self .set_filename (None )
159
+ self .fileencoding = fileencoding
160
+ self .eol_convention = eol_convention
196
161
self .text .insert ("1.0" , chars )
197
162
self .reset_undo ()
198
163
self .set_filename (filename )
@@ -205,74 +170,6 @@ def loadfile(self, filename):
205
170
self .updaterecentfileslist (filename )
206
171
return True
207
172
208
- def _decode (self , two_lines , bytes ):
209
- "Create a Unicode string."
210
- chars = None
211
- # Check presence of a UTF-8 signature first
212
- if bytes .startswith (BOM_UTF8 ):
213
- try :
214
- chars = bytes [3 :].decode ("utf-8" )
215
- except UnicodeDecodeError :
216
- # has UTF-8 signature, but fails to decode...
217
- return None , False
218
- else :
219
- # Indicates that this file originally had a BOM
220
- self .fileencoding = 'BOM'
221
- return chars , False
222
- # Next look for coding specification
223
- try :
224
- enc = coding_spec (two_lines )
225
- except LookupError as name :
226
- tkMessageBox .showerror (
227
- title = "Error loading the file" ,
228
- message = "The encoding '%s' is not known to this Python " \
229
- "installation. The file may not display correctly" % name ,
230
- parent = self .text )
231
- enc = None
232
- except UnicodeDecodeError :
233
- return None , False
234
- if enc :
235
- try :
236
- chars = str (bytes , enc )
237
- self .fileencoding = enc
238
- return chars , False
239
- except UnicodeDecodeError :
240
- pass
241
- # Try ascii:
242
- try :
243
- chars = str (bytes , 'ascii' )
244
- self .fileencoding = None
245
- return chars , False
246
- except UnicodeDecodeError :
247
- pass
248
- # Try utf-8:
249
- try :
250
- chars = str (bytes , 'utf-8' )
251
- self .fileencoding = 'utf-8'
252
- return chars , False
253
- except UnicodeDecodeError :
254
- pass
255
- # Finally, try the locale's encoding. This is deprecated;
256
- # the user should declare a non-ASCII encoding
257
- try :
258
- # Wait for the editor window to appear
259
- self .editwin .text .update ()
260
- enc = askstring (
261
- "Specify file encoding" ,
262
- "The file's encoding is invalid for Python 3.x.\n "
263
- "IDLE will convert it to UTF-8.\n "
264
- "What is the current encoding of the file?" ,
265
- initialvalue = encoding ,
266
- parent = self .editwin .text )
267
-
268
- if enc :
269
- chars = str (bytes , enc )
270
- self .fileencoding = None
271
- return chars , True
272
- except (UnicodeDecodeError , LookupError ):
273
- pass
274
- return None , False # None on failure
275
-
276
173
def maybesave (self ):
277
174
if self .get_saved ():
278
175
return "yes"
@@ -360,38 +257,30 @@ def encode(self, chars):
360
257
# text to us. Don't try to guess further.
361
258
return chars
362
259
# Preserve a BOM that might have been present on opening
363
- if self .fileencoding == 'BOM ' :
364
- return BOM_UTF8 + chars .encode (" utf-8" )
260
+ if self .fileencoding == 'utf-8-sig ' :
261
+ return chars .encode (' utf-8-sig' )
365
262
# See whether there is anything non-ASCII in it.
366
263
# If not, no need to figure out the encoding.
367
264
try :
368
265
return chars .encode ('ascii' )
369
- except UnicodeError :
266
+ except UnicodeEncodeError :
370
267
pass
371
268
# Check if there is an encoding declared
372
269
try :
373
- # a string, let coding_spec slice it to the first two lines
374
- enc = coding_spec (chars )
375
- failed = None
376
- except LookupError as msg :
377
- failed = msg
378
- enc = None
379
- else :
380
- if not enc :
381
- # PEP 3120: default source encoding is UTF-8
382
- enc = 'utf-8'
383
- if enc :
384
- try :
385
- return chars .encode (enc )
386
- except UnicodeError :
387
- failed = "Invalid encoding '%s'" % enc
270
+ encoded = chars .encode ('ascii' , 'replace' )
271
+ enc , _ = tokenize .detect_encoding (io .BytesIO (encoded ).readline )
272
+ return chars .encode (enc )
273
+ except SyntaxError as err :
274
+ failed = str (err )
275
+ except UnicodeEncodeError :
276
+ failed = "Invalid encoding '%s'" % enc
388
277
tkMessageBox .showerror (
389
278
"I/O Error" ,
390
279
"%s.\n Saving as UTF-8" % failed ,
391
- parent = self .text )
280
+ parent = self .text )
392
281
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
393
282
# declared encoding
394
- return BOM_UTF8 + chars .encode (" utf-8" )
283
+ return chars .encode (' utf-8-sig' )
395
284
396
285
def print_window (self , event ):
397
286
confirm = tkMessageBox .askokcancel (
0 commit comments