Detect and preserve original file encoding

This uses a few simple heuristics to detect a file's encoding before rewriting its contents. All file I/O is now binary, and decoding/encoding is done explicitly based on the detected encoding. This covers a few common encodings (utf-8, ascii, windows-1250, windows-1252) and makes it easy to add more encodings should that become necessary. It should make fix_includes.py behave better under Python 3 with non-ASCII-encoded files.
2018-03-18 16:53:32 +01:00 · 2018-03-18 16:53:32 +01:00 · 5082fddccb
parent eee0b0fc51
commit 5082fddccb
2 changed files with 59 additions and 7 deletions
--- a/fix_includes.py
+++ b/fix_includes.py
@ -504,9 +504,11 @@ class LineInfo(object):
 class FileInfo(object):
  """ Details about a file's storage encoding  """
  DEFAULT_LINESEP = os.linesep
+  DEFAULT_ENCODING = 'utf-8'

-  def __init__(self, linesep):
+  def __init__(self, linesep, encoding):
    self.linesep = linesep
+    self.encoding = encoding

  @staticmethod
  def parse(filename):
@ -515,7 +517,8 @@ class FileInfo(object):
      content = f.read()

    linesep = FileInfo.guess_linesep(content)
-    return FileInfo(linesep)
+    encoding = FileInfo.guess_encoding(content)
+    return FileInfo(linesep, encoding)

  @staticmethod
  def guess_linesep(bytebuf):
@ -529,11 +532,41 @@ class FileInfo(object):

    return FileInfo.DEFAULT_LINESEP

+  @staticmethod
+  def guess_encoding(bytebuf):
+    """ Return approximate encoding for buffer.
+
+    This is heavily heuristic, and will return any supported encoding that can
+    describe the file without losing information, not necessarily the *right*
+    encoding. This is usually OK, because IWYU typically only adds ASCII content
+    (or content pulled from the file itself).
+    """
+    def try_decode(buf, encoding):
+      try:
+        buf.decode(encoding, errors='strict')
+      except UnicodeError:
+        return False
+      return True
+
+    # Special-case UTF-8 BOM
+    if bytebuf[0:3] == b'\xef\xbb\xbf':
+      if try_decode(bytebuf, 'utf-8'):
+        return 'utf-8'
+
+    encodings = ['ascii', 'utf-8', 'windows-1250', 'windows-1252']
+    for encoding in encodings:
+      if try_decode(bytebuf, encoding):
+        return encoding
+
+    return FileInfo.DEFAULT_ENCODING
+

 def _ReadFile(filename, fileinfo):
  """Read from filename and return a list of file lines."""
  try:
-    return open(filename).read().splitlines()
+    with open(filename, 'rb') as f:
+      content = f.read()
+      return content.decode(fileinfo.encoding).splitlines()
  except (IOError, OSError) as why:
    print("Skipping '%s': %s" % (filename, why))
  return None
@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo):
 def _WriteFile(filename, fileinfo, file_lines):
  """Write the given file-lines to the file."""
  try:
-    # Open file in binary mode to preserve line endings
    with open(filename, 'wb') as f:
-      f.write(fileinfo.linesep.join(file_lines))
-      f.write(fileinfo.linesep)
+      content = fileinfo.linesep.join(file_lines) + fileinfo.linesep
+      content = content.encode(fileinfo.encoding)
+      f.write(content)
  except (IOError, OSError) as why:
    print("Error writing '%s': %s" % (filename, why))

--- a/fix_includes_test.py
+++ b/fix_includes_test.py
@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase):
    return self.before_map[filename]

  def _ParseFileInfo(self, filename):
-      return fix_includes.FileInfo('\n')
+      return fix_includes.FileInfo('\n', 'utf-8')

  def _WriteFile(self, filename, fileinfo, contents):
      return self.actual_after_contents.extend(contents)
@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase):
    self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP,
                     fix_includes.FileInfo.guess_linesep(buf))

+  def testEncodingASCII(self):
+    buf = b'abcdefgh'
+    self.assertEqual('ascii', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingUTF8BOM(self):
+    buf = b'\xef\xbb\xbfSomeASCIIButWithTheBOM'
+    self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingUTF8NoBOM(self):
+    # This is a recurring test input in Swedish, translates to "shrimp sandwich"
+    # and contains all three Swedish exotic characters.
+    buf = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'
+    self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingISO8859_1(self):
+    # Yours truly
+    buf = b'Kim Gr\xe4sman'
+    self.assertEqual('windows-1250', fix_includes.FileInfo.guess_encoding(buf))
+

 if __name__ == '__main__':
  unittest.main()