Detect and preserve original file encoding

This uses a few simple heuristics to detect file encoding before
rewriting file contents.

All file I/O is now binary, and decoding/encoding is explicit based on
detected encoding.

Cover a few common encodings (utf-8, ascii, windows-1250, windows-1252),
and make it easy to add more encodings should it be necessary.

This should make fix_includes.py behave better under Python 3 with
non-ASCII-encoded files.
This commit is contained in:
Kim Grasman 2018-03-18 16:53:32 +01:00 committed by Kim Gräsman
parent eee0b0fc51
commit 5082fddccb
2 changed files with 59 additions and 7 deletions

View File

@ -504,9 +504,11 @@ class LineInfo(object):
class FileInfo(object):
""" Details about a file's storage encoding """
DEFAULT_LINESEP = os.linesep
DEFAULT_ENCODING = 'utf-8'
def __init__(self, linesep, encoding):
    """Record how a file is stored on disk.

    Args:
      linesep: line separator used when the file's lines are joined for
        writing.
      encoding: codec name used to decode the file's bytes on read and to
        encode them again on write.
    """
    self.linesep = linesep
    self.encoding = encoding
@staticmethod
def parse(filename):
@ -515,7 +517,8 @@ class FileInfo(object):
content = f.read()
linesep = FileInfo.guess_linesep(content)
return FileInfo(linesep)
encoding = FileInfo.guess_encoding(content)
return FileInfo(linesep, encoding)
@staticmethod
def guess_linesep(bytebuf):
@ -529,11 +532,41 @@ class FileInfo(object):
return FileInfo.DEFAULT_LINESEP
@staticmethod
def guess_encoding(bytebuf):
    """ Return the name of a codec that can strictly decode bytebuf.

    The answer is a best guess: the first supported codec that decodes the
    whole buffer without error wins, which need not be the codec the file
    was actually written in. A buffer starting with the UTF-8 byte order mark
    is checked against 'utf-8' before anything else. If no candidate decodes
    the buffer, FileInfo.DEFAULT_ENCODING is returned.
    """
    def decodes_as(codec):
        # Strict decoding: any byte the codec cannot map rejects the codec.
        try:
            bytebuf.decode(codec, errors='strict')
        except UnicodeError:
            return False
        return True

    # A leading UTF-8 BOM settles the question, provided the rest is valid.
    utf8_bom = b'\xef\xbb\xbf'
    if bytebuf[:3] == utf8_bom and decodes_as('utf-8'):
        return 'utf-8'

    # Candidates in priority order; add new codecs here.
    candidates = ('ascii', 'utf-8', 'windows-1250', 'windows-1252')
    first_match = next((c for c in candidates if decodes_as(c)), None)
    if first_match is not None:
        return first_match

    return FileInfo.DEFAULT_ENCODING
def _ReadFile(filename, fileinfo):
    """Read from filename and return a list of file lines.

    The file is read as bytes and decoded with fileinfo.encoding, so the
    result does not depend on the platform's default text encoding.

    Args:
      filename: path of the file to read.
      fileinfo: object whose 'encoding' attribute names the codec to decode
        the file contents with.

    Returns:
      A list of decoded lines without line terminators, or None if the file
      could not be read or decoded (a message is printed in that case).
    """
    try:
        with open(filename, 'rb') as f:
            content = f.read()
        return content.decode(fileinfo.encoding).splitlines()
    except (IOError, OSError, UnicodeError) as why:
        # UnicodeError: the recorded encoding may be a fallback guess that
        # cannot actually decode these bytes; skip the file instead of
        # crashing.
        print("Skipping '%s': %s" % (filename, why))
        return None
@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo):
def _WriteFile(filename, fileinfo, file_lines):
    """Write the given file-lines to the file.

    Lines are joined with fileinfo.linesep, a trailing separator is appended,
    and the result is encoded with fileinfo.encoding before being written.

    Args:
      filename: path of the file to (over)write.
      fileinfo: object providing 'linesep' and 'encoding' attributes.
      file_lines: list of text lines without line terminators.

    Returns:
      None. Failures are reported with a printed message, not raised.
    """
    try:
        content = fileinfo.linesep.join(file_lines) + fileinfo.linesep
        # Encode before opening: opening with 'wb' truncates the file, so an
        # encoding failure must happen while the old contents still exist.
        encoded = content.encode(fileinfo.encoding)
        # Open file in binary mode to preserve line endings
        with open(filename, 'wb') as f:
            f.write(encoded)
    except (IOError, OSError, UnicodeError) as why:
        print("Error writing '%s': %s" % (filename, why))

View File

@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase):
return self.before_map[filename]
def _ParseFileInfo(self, filename):
    """Return fixed file metadata for tests instead of inspecting a file.

    The filename argument is ignored; every file is reported as using LF
    line endings and the 'utf-8' encoding.
    """
    return fix_includes.FileInfo('\n', 'utf-8')
def _WriteFile(self, filename, fileinfo, contents):
    """Collect would-be file output in memory instead of writing to disk.

    The lines in contents are appended to self.actual_after_contents;
    filename and fileinfo are not used. Returns whatever extend() returns.
    """
    collected = self.actual_after_contents
    outcome = collected.extend(contents)
    return outcome
@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase):
self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP,
fix_includes.FileInfo.guess_linesep(buf))
def testEncodingASCII(self):
    """Plain 7-bit input is reported as 'ascii'."""
    detected = fix_includes.FileInfo.guess_encoding(b'abcdefgh')
    self.assertEqual('ascii', detected)
def testEncodingUTF8BOM(self):
    """Input that starts with the UTF-8 byte order mark is reported as 'utf-8'."""
    detected = fix_includes.FileInfo.guess_encoding(
        b'\xef\xbb\xbfSomeASCIIButWithTheBOM')
    self.assertEqual('utf-8', detected)
def testEncodingUTF8NoBOM(self):
    """Multi-byte UTF-8 text without a byte order mark is reported as 'utf-8'."""
    # Swedish for "shrimp sandwich"; a recurring test input that contains
    # all three Swedish exotic characters, here encoded as UTF-8.
    shrimp_sandwich = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'
    detected = fix_includes.FileInfo.guess_encoding(shrimp_sandwich)
    self.assertEqual('utf-8', detected)
def testEncodingISO8859_1(self):
    """A single-byte 0xE4 that is not valid UTF-8 is reported as 'windows-1250'."""
    # Yours truly. The expected answer is 'windows-1250' because that is the
    # answer guess_encoding is asserted to give for this input; the test name
    # refers to the kind of input, not to the reported codec.
    author = b'Kim Gr\xe4sman'
    detected = fix_includes.FileInfo.guess_encoding(author)
    self.assertEqual('windows-1250', detected)
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()