Detect and preserve original file encoding
This uses a few simple heuristics to detect a file's encoding before rewriting its contents. All file I/O is now binary, and decoding/encoding is explicit, based on the detected encoding. It covers a few common encodings (utf-8, ascii, windows-1250, windows-1252) and makes it easy to add more should that become necessary. This should make fix_includes.py behave better under Python 3 with non-ASCII-encoded files.
This commit is contained in:
parent
eee0b0fc51
commit
5082fddccb
|
@ -504,9 +504,11 @@ class LineInfo(object):
|
|||
class FileInfo(object):
|
||||
""" Details about a file's storage encoding """
|
||||
DEFAULT_LINESEP = os.linesep
|
||||
DEFAULT_ENCODING = 'utf-8'
|
||||
|
||||
def __init__(self, linesep):
|
||||
def __init__(self, linesep, encoding):
    """Store the line separator and the encoding name for a file."""
    self.linesep, self.encoding = linesep, encoding
|
||||
|
||||
@staticmethod
|
||||
def parse(filename):
|
||||
|
@ -515,7 +517,8 @@ class FileInfo(object):
|
|||
content = f.read()
|
||||
|
||||
linesep = FileInfo.guess_linesep(content)
|
||||
return FileInfo(linesep)
|
||||
encoding = FileInfo.guess_encoding(content)
|
||||
return FileInfo(linesep, encoding)
|
||||
|
||||
@staticmethod
|
||||
def guess_linesep(bytebuf):
|
||||
|
@ -529,11 +532,41 @@ class FileInfo(object):
|
|||
|
||||
return FileInfo.DEFAULT_LINESEP
|
||||
|
||||
@staticmethod
|
||||
def guess_encoding(bytebuf):
    """ Return approximate encoding for buffer.

    This is a heuristic: the result is a supported encoding that can decode
    the whole buffer strictly (i.e. without losing information), which is not
    necessarily the encoding the file was authored in. That is usually good
    enough, because IWYU typically only adds ASCII content (or content pulled
    from the file itself).

    Returns 'utf-8' for a buffer that starts with a UTF-8 BOM and is valid
    UTF-8, otherwise the first candidate encoding that decodes the buffer,
    otherwise FileInfo.DEFAULT_ENCODING.
    """
    def decodes_cleanly(codec):
        # Strict decoding raises on any byte sequence the codec cannot map.
        try:
            bytebuf.decode(codec, errors='strict')
            return True
        except UnicodeError:
            return False

    # A UTF-8 BOM would fail the ASCII probe anyway; check it up front so the
    # intent is explicit.
    if bytebuf.startswith(b'\xef\xbb\xbf') and decodes_cleanly('utf-8'):
        return 'utf-8'

    # Ordered from most to least restrictive; append new encodings here.
    candidates = ('ascii', 'utf-8', 'windows-1250', 'windows-1252')
    for candidate in candidates:
        if decodes_cleanly(candidate):
            return candidate

    return FileInfo.DEFAULT_ENCODING
|
||||
|
||||
|
||||
def _ReadFile(filename, fileinfo):
    """Read from filename and return a list of file lines.

    The file is read in binary mode and decoded explicitly with
    fileinfo.encoding, so the result does not depend on the platform's
    default text encoding.

    Arguments:
      filename: path of the file to read.
      fileinfo: object whose 'encoding' attribute names the codec to use.

    Returns:
      The decoded lines without line terminators, or None if the file could
      not be read or decoded (a "Skipping" message is printed in that case).
    """
    try:
        with open(filename, 'rb') as f:
            content = f.read()
        return content.decode(fileinfo.encoding).splitlines()
    except (IOError, OSError, UnicodeError) as why:
        # UnicodeError: the encoding is only a guess and may not actually
        # decode the content; skip the file instead of aborting the run.
        print("Skipping '%s': %s" % (filename, why))
        return None
|
||||
|
@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo):
|
|||
def _WriteFile(filename, fileinfo, file_lines):
    """Write the given file-lines to the file.

    Lines are joined and terminated with fileinfo.linesep, encoded with
    fileinfo.encoding, and written in binary mode.

    Arguments:
      filename: path of the file to (over)write.
      fileinfo: object providing 'linesep' and 'encoding' attributes.
      file_lines: list of text lines without line terminators.

    On an I/O or encoding error a message is printed and nothing is raised.
    """
    try:
        # Encode before opening: mode 'wb' truncates the file immediately, so
        # an encoding failure must happen while the old contents still exist.
        content = fileinfo.linesep.join(file_lines) + fileinfo.linesep
        encoded = content.encode(fileinfo.encoding)
        # Open file in binary mode to preserve line endings
        with open(filename, 'wb') as f:
            f.write(encoded)
    except (IOError, OSError, UnicodeError) as why:
        print("Error writing '%s': %s" % (filename, why))
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase):
|
|||
return self.before_map[filename]
|
||||
|
||||
def _ParseFileInfo(self, filename):
    """Return a fixed FileInfo (LF line separator, utf-8) for any filename."""
    newline, codec = '\n', 'utf-8'
    return fix_includes.FileInfo(newline, codec)
|
||||
|
||||
def _WriteFile(self, filename, fileinfo, contents):
    """Append contents to self.actual_after_contents instead of writing a file."""
    outcome = self.actual_after_contents.extend(contents)
    return outcome
|
||||
|
@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase):
|
|||
self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP,
|
||||
fix_includes.FileInfo.guess_linesep(buf))
|
||||
|
||||
def testEncodingASCII(self):
    """Pure 7-bit input is reported as 'ascii'."""
    detected = fix_includes.FileInfo.guess_encoding(b'abcdefgh')
    self.assertEqual('ascii', detected)
|
||||
|
||||
def testEncodingUTF8BOM(self):
    """ASCII text preceded by a UTF-8 BOM is reported as 'utf-8'."""
    with_bom = b'\xef\xbb\xbfSomeASCIIButWithTheBOM'
    detected = fix_includes.FileInfo.guess_encoding(with_bom)
    self.assertEqual('utf-8', detected)
|
||||
|
||||
def testEncodingUTF8NoBOM(self):
    """Multi-byte UTF-8 text without a BOM is reported as 'utf-8'."""
    # Recurring Swedish test input meaning "shrimp sandwich"; it contains all
    # three Swedish exotic characters, here as UTF-8 byte sequences.
    shrimp_sandwich = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'
    detected = fix_includes.FileInfo.guess_encoding(shrimp_sandwich)
    self.assertEqual('utf-8', detected)
|
||||
|
||||
def testEncodingISO8859_1(self):
    """A lone 0xe4 byte is reported as the first single-byte candidate."""
    # Yours truly. The 0xe4 byte is neither ASCII nor valid UTF-8 here, so
    # the expected result is 'windows-1250', the first single-byte encoding
    # guess_encoding tries, even though the input is written as ISO-8859-1.
    author = b'Kim Gr\xe4sman'
    detected = fix_includes.FileInfo.guess_encoding(author)
    self.assertEqual('windows-1250', detected)
|
||||
|
||||
|
||||
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
|
|
Loading…
Reference in New Issue