Skip to content

Commit c3fa753

Browse files
bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)
(cherry picked from commit 694d31e)
Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 42f05e6 commit c3fa753

File tree

1 file changed

+41
-152
lines changed

1 file changed

+41
-152
lines changed

Lib/idlelib/iomenu.py

+41 -152
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,8 @@
1-
import codecs
2-
from codecs import BOM_UTF8
31
import os
4-
import re
52
import shlex
63
import sys
74
import tempfile
5+
import tokenize
86

97
import tkinter.filedialog as tkFileDialog
108
import tkinter.messagebox as tkMessageBox
@@ -20,49 +18,6 @@
2018
errors = 'surrogateescape'
2119

2220

23-
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
24-
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
25-
26-
def coding_spec(data):
27-
"""Return the encoding declaration according to PEP 263.
28-
29-
When checking encoded data, only the first two lines should be passed
30-
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
31-
The first two lines would contain the encoding specification.
32-
33-
Raise a LookupError if the encoding is declared but unknown.
34-
"""
35-
if isinstance(data, bytes):
36-
# This encoding might be wrong. However, the coding
37-
# spec must be ASCII-only, so any non-ASCII characters
38-
# around here will be ignored. Decoding to Latin-1 should
39-
# never fail (except for memory outage)
40-
lines = data.decode('iso-8859-1')
41-
else:
42-
lines = data
43-
# consider only the first two lines
44-
if '\n' in lines:
45-
lst = lines.split('\n', 2)[:2]
46-
elif '\r' in lines:
47-
lst = lines.split('\r', 2)[:2]
48-
else:
49-
lst = [lines]
50-
for line in lst:
51-
match = coding_re.match(line)
52-
if match is not None:
53-
break
54-
if not blank_re.match(line):
55-
return None
56-
else:
57-
return None
58-
name = match.group(1)
59-
try:
60-
codecs.lookup(name)
61-
except LookupError:
62-
# The standard encoding error does not indicate the encoding
63-
raise LookupError("Unknown encoding: "+name)
64-
return name
65-
6621

6722
class IOBinding:
6823
# One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ def __init__(self, editwin):
7833
self.save_as)
7934
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
8035
self.save_a_copy)
81-
self.fileencoding = None
36+
self.fileencoding = 'utf-8'
8237
self.__id_print = self.text.bind("<<print-window>>", self.print_window)
8338

8439
def close(self):
@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
165120
self.text.focus_set()
166121
return "break"
167122

168-
eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
169-
eol_re = re.compile(eol)
170123
eol_convention = os.linesep # default
171124

172125
def loadfile(self, filename):
173126
try:
174-
# open the file in binary mode so that we can handle
175-
# end-of-line convention ourselves.
176-
with open(filename, 'rb') as f:
177-
two_lines = f.readline() + f.readline()
178-
f.seek(0)
179-
bytes = f.read()
180-
except OSError as msg:
181-
tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
127+
try:
128+
with tokenize.open(filename) as f:
129+
chars = f.read()
130+
fileencoding = f.encoding
131+
eol_convention = f.newlines
132+
converted = False
133+
except (UnicodeDecodeError, SyntaxError):
134+
# Wait for the editor window to appear
135+
self.editwin.text.update()
136+
enc = askstring(
137+
"Specify file encoding",
138+
"The file's encoding is invalid for Python 3.x.\n"
139+
"IDLE will convert it to UTF-8.\n"
140+
"What is the current encoding of the file?",
141+
initialvalue='utf-8',
142+
parent=self.editwin.text)
143+
with open(filename, encoding=enc) as f:
144+
chars = f.read()
145+
fileencoding = f.encoding
146+
eol_convention = f.newlines
147+
converted = True
148+
except OSError as err:
149+
tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
182150
return False
183-
chars, converted = self._decode(two_lines, bytes)
184-
if chars is None:
151+
except UnicodeDecodeError:
185152
tkMessageBox.showerror("Decoding Error",
186153
"File %s\nFailed to Decode" % filename,
187154
parent=self.text)
188155
return False
189-
# We now convert all end-of-lines to '\n's
190-
firsteol = self.eol_re.search(chars)
191-
if firsteol:
192-
self.eol_convention = firsteol.group(0)
193-
chars = self.eol_re.sub(r"\n", chars)
156+
194157
self.text.delete("1.0", "end")
195158
self.set_filename(None)
159+
self.fileencoding = fileencoding
160+
self.eol_convention = eol_convention
196161
self.text.insert("1.0", chars)
197162
self.reset_undo()
198163
self.set_filename(filename)
@@ -205,74 +170,6 @@ def loadfile(self, filename):
205170
self.updaterecentfileslist(filename)
206171
return True
207172

208-
def _decode(self, two_lines, bytes):
209-
"Create a Unicode string."
210-
chars = None
211-
# Check presence of a UTF-8 signature first
212-
if bytes.startswith(BOM_UTF8):
213-
try:
214-
chars = bytes[3:].decode("utf-8")
215-
except UnicodeDecodeError:
216-
# has UTF-8 signature, but fails to decode...
217-
return None, False
218-
else:
219-
# Indicates that this file originally had a BOM
220-
self.fileencoding = 'BOM'
221-
return chars, False
222-
# Next look for coding specification
223-
try:
224-
enc = coding_spec(two_lines)
225-
except LookupError as name:
226-
tkMessageBox.showerror(
227-
title="Error loading the file",
228-
message="The encoding '%s' is not known to this Python "\
229-
"installation. The file may not display correctly" % name,
230-
parent = self.text)
231-
enc = None
232-
except UnicodeDecodeError:
233-
return None, False
234-
if enc:
235-
try:
236-
chars = str(bytes, enc)
237-
self.fileencoding = enc
238-
return chars, False
239-
except UnicodeDecodeError:
240-
pass
241-
# Try ascii:
242-
try:
243-
chars = str(bytes, 'ascii')
244-
self.fileencoding = None
245-
return chars, False
246-
except UnicodeDecodeError:
247-
pass
248-
# Try utf-8:
249-
try:
250-
chars = str(bytes, 'utf-8')
251-
self.fileencoding = 'utf-8'
252-
return chars, False
253-
except UnicodeDecodeError:
254-
pass
255-
# Finally, try the locale's encoding. This is deprecated;
256-
# the user should declare a non-ASCII encoding
257-
try:
258-
# Wait for the editor window to appear
259-
self.editwin.text.update()
260-
enc = askstring(
261-
"Specify file encoding",
262-
"The file's encoding is invalid for Python 3.x.\n"
263-
"IDLE will convert it to UTF-8.\n"
264-
"What is the current encoding of the file?",
265-
initialvalue = encoding,
266-
parent = self.editwin.text)
267-
268-
if enc:
269-
chars = str(bytes, enc)
270-
self.fileencoding = None
271-
return chars, True
272-
except (UnicodeDecodeError, LookupError):
273-
pass
274-
return None, False # None on failure
275-
276173
def maybesave(self):
277174
if self.get_saved():
278175
return "yes"
@@ -360,38 +257,30 @@ def encode(self, chars):
360257
# text to us. Don't try to guess further.
361258
return chars
362259
# Preserve a BOM that might have been present on opening
363-
if self.fileencoding == 'BOM':
364-
return BOM_UTF8 + chars.encode("utf-8")
260+
if self.fileencoding == 'utf-8-sig':
261+
return chars.encode('utf-8-sig')
365262
# See whether there is anything non-ASCII in it.
366263
# If not, no need to figure out the encoding.
367264
try:
368265
return chars.encode('ascii')
369-
except UnicodeError:
266+
except UnicodeEncodeError:
370267
pass
371268
# Check if there is an encoding declared
372269
try:
373-
# a string, let coding_spec slice it to the first two lines
374-
enc = coding_spec(chars)
375-
failed = None
376-
except LookupError as msg:
377-
failed = msg
378-
enc = None
379-
else:
380-
if not enc:
381-
# PEP 3120: default source encoding is UTF-8
382-
enc = 'utf-8'
383-
if enc:
384-
try:
385-
return chars.encode(enc)
386-
except UnicodeError:
387-
failed = "Invalid encoding '%s'" % enc
270+
encoded = chars.encode('ascii', 'replace')
271+
enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
272+
return chars.encode(enc)
273+
except SyntaxError as err:
274+
failed = str(err)
275+
except UnicodeEncodeError:
276+
failed = "Invalid encoding '%s'" % enc
388277
tkMessageBox.showerror(
389278
"I/O Error",
390279
"%s.\nSaving as UTF-8" % failed,
391-
parent = self.text)
280+
parent=self.text)
392281
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
393282
# declared encoding
394-
return BOM_UTF8 + chars.encode("utf-8")
283+
return chars.encode('utf-8-sig')
395284

396285
def print_window(self, event):
397286
confirm = tkMessageBox.askokcancel(

0 commit comments

Comments
 (0)