Bug 82651 - Tika&Sphinx&chardet content indexing (done!)

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1389 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-09-27 16:34:42 +00:00 committed by Vitaliy Filippov
parent e363cf19b1
commit 639d1c25db
1 changed file with 8 additions and 1 deletion

View File

@@ -46,7 +46,14 @@ class ContentMagic:
charset = chardet.detect(content)
if charset and charset['encoding']:
charset = charset['encoding']
content = content.decode(charset)
if charset == 'MacCyrillic':
# Silly MacCyr, try cp1251
try:
content = content.decode('windows-1251')
charset = 'windows-1251'
except: content = content.decode(charset)
else:
content = content.decode(charset)
except: charset = None
else:
# Try UTF-8