bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)

miss-islington · serhiy-storchaka · web-flow · commit c3fa7534c717 · 2020-07-01T11:22:45.000-07:00
(cherry picked from commit 694d31e) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
diff --git a/Lib/idlelib/iomenu.py b/Lib/idlelib/iomenu.py
@@ -1,10 +1,8 @@
-import codecs
-from codecs import BOM_UTF8
 import os
-import re
 import shlex
 import sys
 import tempfile
+import tokenize
 
 import tkinter.filedialog as tkFileDialog
 import tkinter.messagebox as tkMessageBox
@@ -20,49 +18,6 @@
     errors = 'surrogateescape'
 
 
-coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-
-def coding_spec(data):
-    """Return the encoding declaration according to PEP 263.
-
-    When checking encoded data, only the first two lines should be passed
-    in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
-    The first two lines would contain the encoding specification.
-
-    Raise a LookupError if the encoding is declared but unknown.
-    """
-    if isinstance(data, bytes):
-        # This encoding might be wrong. However, the coding
-        # spec must be ASCII-only, so any non-ASCII characters
-        # around here will be ignored. Decoding to Latin-1 should
-        # never fail (except for memory outage)
-        lines = data.decode('iso-8859-1')
-    else:
-        lines = data
-    # consider only the first two lines
-    if '\n' in lines:
-        lst = lines.split('\n', 2)[:2]
-    elif '\r' in lines:
-        lst = lines.split('\r', 2)[:2]
-    else:
-        lst = [lines]
-    for line in lst:
-        match = coding_re.match(line)
-        if match is not None:
-            break
-        if not blank_re.match(line):
-            return None
-    else:
-        return None
-    name = match.group(1)
-    try:
-        codecs.lookup(name)
-    except LookupError:
-        # The standard encoding error does not indicate the encoding
-        raise LookupError("Unknown encoding: "+name)
-    return name
-
 
 class IOBinding:
 # One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ def __init__(self, editwin):
                                           self.save_as)
         self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
                                             self.save_a_copy)
-        self.fileencoding = None
+        self.fileencoding = 'utf-8'
         self.__id_print = self.text.bind("<<print-window>>", self.print_window)
 
     def close(self):
@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
             self.text.focus_set()
         return "break"
 
-    eol = r"(\r\n)|\n|\r"  # \r\n (Windows), \n (UNIX), or \r (Mac)
-    eol_re = re.compile(eol)
     eol_convention = os.linesep  # default
 
     def loadfile(self, filename):
         try:
-            # open the file in binary mode so that we can handle
-            # end-of-line convention ourselves.
-            with open(filename, 'rb') as f:
-                two_lines = f.readline() + f.readline()
-                f.seek(0)
-                bytes = f.read()
-        except OSError as msg:
-            tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
+            try:
+                with tokenize.open(filename) as f:
+                    chars = f.read()
+                    fileencoding = f.encoding
+                    eol_convention = f.newlines
+                    converted = False
+            except (UnicodeDecodeError, SyntaxError):
+                # Wait for the editor window to appear
+                self.editwin.text.update()
+                enc = askstring(
+                    "Specify file encoding",
+                    "The file's encoding is invalid for Python 3.x.\n"
+                    "IDLE will convert it to UTF-8.\n"
+                    "What is the current encoding of the file?",
+                    initialvalue='utf-8',
+                    parent=self.editwin.text)
+                with open(filename, encoding=enc) as f:
+                    chars = f.read()
+                    fileencoding = f.encoding
+                    eol_convention = f.newlines
+                    converted = True
+        except OSError as err:
+            tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
             return False
-        chars, converted = self._decode(two_lines, bytes)
-        if chars is None:
+        except (UnicodeDecodeError, LookupError):  # LookupError: unknown codec
             tkMessageBox.showerror("Decoding Error",
                                    "File %s\nFailed to Decode" % filename,
                                    parent=self.text)
             return False
-        # We now convert all end-of-lines to '\n's
-        firsteol = self.eol_re.search(chars)
-        if firsteol:
-            self.eol_convention = firsteol.group(0)
-            chars = self.eol_re.sub(r"\n", chars)
         self.text.delete("1.0", "end")
         self.set_filename(None)
+        self.fileencoding = fileencoding
+        if isinstance(eol_convention, str):  # None/tuple: keep current
+            self.eol_convention = eol_convention
         self.text.insert("1.0", chars)
         self.reset_undo()
         self.set_filename(filename)
@@ -205,74 +170,6 @@ def loadfile(self, filename):
         self.updaterecentfileslist(filename)
         return True
 
-    def _decode(self, two_lines, bytes):
-        "Create a Unicode string."
-        chars = None
-        # Check presence of a UTF-8 signature first
-        if bytes.startswith(BOM_UTF8):
-            try:
-                chars = bytes[3:].decode("utf-8")
-            except UnicodeDecodeError:
-                # has UTF-8 signature, but fails to decode...
-                return None, False
-            else:
-                # Indicates that this file originally had a BOM
-                self.fileencoding = 'BOM'
-                return chars, False
-        # Next look for coding specification
-        try:
-            enc = coding_spec(two_lines)
-        except LookupError as name:
-            tkMessageBox.showerror(
-                title="Error loading the file",
-                message="The encoding '%s' is not known to this Python "\
-                "installation. The file may not display correctly" % name,
-                parent = self.text)
-            enc = None
-        except UnicodeDecodeError:
-            return None, False
-        if enc:
-            try:
-                chars = str(bytes, enc)
-                self.fileencoding = enc
-                return chars, False
-            except UnicodeDecodeError:
-                pass
-        # Try ascii:
-        try:
-            chars = str(bytes, 'ascii')
-            self.fileencoding = None
-            return chars, False
-        except UnicodeDecodeError:
-            pass
-        # Try utf-8:
-        try:
-            chars = str(bytes, 'utf-8')
-            self.fileencoding = 'utf-8'
-            return chars, False
-        except UnicodeDecodeError:
-            pass
-        # Finally, try the locale's encoding. This is deprecated;
-        # the user should declare a non-ASCII encoding
-        try:
-            # Wait for the editor window to appear
-            self.editwin.text.update()
-            enc = askstring(
-                "Specify file encoding",
-                "The file's encoding is invalid for Python 3.x.\n"
-                "IDLE will convert it to UTF-8.\n"
-                "What is the current encoding of the file?",
-                initialvalue = encoding,
-                parent = self.editwin.text)
-
-            if enc:
-                chars = str(bytes, enc)
-                self.fileencoding = None
-            return chars, True
-        except (UnicodeDecodeError, LookupError):
-            pass
-        return None, False  # None on failure
-
     def maybesave(self):
         if self.get_saved():
             return "yes"
@@ -360,38 +257,35 @@ def encode(self, chars):
             # text to us. Don't try to guess further.
             return chars
         # Preserve a BOM that might have been present on opening
-        if self.fileencoding == 'BOM':
-            return BOM_UTF8 + chars.encode("utf-8")
+        if self.fileencoding == 'utf-8-sig':
+            return chars.encode('utf-8-sig')
         # See whether there is anything non-ASCII in it.
         # If not, no need to figure out the encoding.
         try:
             return chars.encode('ascii')
-        except UnicodeError:
+        except UnicodeEncodeError:
             pass
         # Check if there is an encoding declared
         try:
-            # a string, let coding_spec slice it to the first two lines
-            enc = coding_spec(chars)
-            failed = None
-        except LookupError as msg:
-            failed = msg
-            enc = None
-        else:
-            if not enc:
-                # PEP 3120: default source encoding is UTF-8
-                enc = 'utf-8'
-        if enc:
-            try:
-                return chars.encode(enc)
-            except UnicodeError:
-                failed = "Invalid encoding '%s'" % enc
+            # Imported locally so that this method does not depend on a
+            # module-level "import io".
+            import io
+            # A coding declaration is ASCII-only, so replacing non-ASCII
+            # characters with '?' cannot affect its detection.
+            encoded = chars.encode('ascii', 'replace')
+            enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
+            return chars.encode(enc)
+        except SyntaxError as err:
+            failed = str(err)
+        except UnicodeEncodeError:
+            failed = "Invalid encoding '%s'" % enc
         tkMessageBox.showerror(
             "I/O Error",
             "%s.\nSaving as UTF-8" % failed,
-            parent = self.text)
+            parent=self.text)
         # Fallback: save as UTF-8, with BOM - ignoring the incorrect
         # declared encoding
-        return BOM_UTF8 + chars.encode("utf-8")
+        return chars.encode('utf-8-sig')
 
     def print_window(self, event):
         confirm = tkMessageBox.askokcancel(