Commit 3db5dd5a authored by Cédric Bellegarde

First try to fix encoding detection

parent 6f385138
Pipeline #213573 passed with stage
in 3 minutes and 44 seconds
# Copyright (c) 2014-2020 Cedric Bellegarde <cedric.bellegarde@adishatz.org>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
class FrameTag:
    """
        Bytes representing a frame
    """

    def __init__(self, bytes):
        """
            Init tag reader
            @param bytes as bytes
        """
        # The first four bytes are read as the frame key. Anything that
        # can not be sliced/decoded falls back to the "None" key so that
        # callers comparing against a real key simply do not match.
        try:
            self.__key = bytes[0:4].decode("utf-8")
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must not be
            # swallowed while reading tags.
            self.__key = "None"
        self.__bytes = bytes

    @property
    def frame(self):
        """
            Get frame data: everything after the first 10 bytes
            (presumably the frame header, confirm against the ID3v2 spec)
            @return bytes
        """
        return self.__bytes[10:]

    @property
    def encoding(self):
        """
            Get encoding marker: the first byte of the frame data
            @return bytes (empty if the frame has no data)
        """
        return self.frame[0:1]

    @property
    def key(self):
        """
            Get frame key
            @return str ("None" if the key could not be decoded)
        """
        return self.__key

    @property
    def string(self):
        """
            String representation of data
            Base implementation has nothing to decode
            @return str
        """
        return ""
# Copyright (c) 2014-2020 Cedric Bellegarde <cedric.bellegarde@adishatz.org>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from lollypop.logger import Logger
from lollypop.utils_file import decodeUnicode, splitUnicode
from lollypop.tag_frame import FrameTag
class FrameLangTag(FrameTag):
    """
        Frame whose text comes with a lang
    """

    def __init__(self, bytes):
        """
            Init tag reader
            @param bytes as bytes
        """
        super().__init__(bytes)

    @property
    def string(self):
        """
            Decode the frame text
            @return str (empty when decoding fails)
        """
        try:
            marker = self.encoding
            # Drop the first 4 bytes of the frame data before splitting
            # (presumably encoding marker + lang code)
            payload = self.frame[4:]
            return decodeUnicode(splitUnicode(payload, marker), marker)
        except Exception as e:
            Logger.error("FrameLangTag::string: %s, %s", e, self.frame)
        return ""
# Copyright (c) 2014-2020 Cedric Bellegarde <cedric.bellegarde@adishatz.org>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from lollypop.logger import Logger
from lollypop.utils_file import decodeUnicode, splitUnicode
from lollypop.tag_frame import FrameTag
class FrameTextTag(FrameTag):
    """
        Frame holding plain text
    """

    def __init__(self, bytes):
        """
            Init tag reader
            @param bytes as bytes
        """
        super().__init__(bytes)

    @property
    def string(self):
        """
            Decode the frame text
            @return str (empty when decoding fails)
        """
        try:
            marker = self.encoding
            # NOTE(review): the whole frame data is passed on, so it still
            # starts with the encoding marker byte; confirm splitUnicode
            # is expected to get rid of it
            payload = splitUnicode(self.frame, marker)
            return decodeUnicode(payload, marker)
        except Exception as e:
            Logger.error("FrameTextTag::string: %s, %s", e, self.frame)
        return ""
......@@ -19,6 +19,8 @@ from lollypop.define import App
from lollypop.logger import Logger
from lollypop.utils_file import decodeUnicode, splitUnicode
from lollypop.utils import format_artist_name, get_iso_date_from_string
from lollypop.tag_frame_text import FrameTextTag
from lollypop.tag_frame_lang import FrameLangTag
class Discoverer:
......@@ -346,9 +348,11 @@ class TagReader:
(exists, m) = sample.get_buffer().map(Gst.MapFlags.READ)
if not exists:
continue
prefix = m.data[0:4]
if prefix in [b"TCMP"]:
string = self.__get_string_from_bytes(m.data, 0)
frame = FrameTextTag(m.data)
if frame.key == "TCMP":
string = frame.string
if not string:
Logger.debug(tags.to_string())
return string and string[-1] == "1"
size = tags.get_tag_size("extended-comment")
for i in range(0, size):
......@@ -438,10 +442,11 @@ class TagReader:
(exists, m) = sample.get_buffer().map(Gst.MapFlags.READ)
if not exists:
continue
prefix = m.data[0:4]
if prefix in [b"TDOR"]:
string = self.__get_string_from_bytes(m.data, 0)
date = get_iso_date_from_string(string)
frame = FrameTextTag(m.data)
if frame.key == "TDOR":
if not frame.string:
Logger.debug(tags.to_string())
date = get_iso_date_from_string(frame.string)
datetime = GLib.DateTime.new_from_iso8601(date, None)
return (datetime.get_year(), datetime.to_unix())
except:
......@@ -570,9 +575,9 @@ class TagReader:
"""
def decode_lyrics(bytes):
try:
prefix = bytes[0:4]
if prefix in [b"USLT"]:
return self.__get_string_from_bytes(bytes, 4)
frame = FrameLangTag(bytes)
if frame.key == "USLT":
return frame.string
except Exception as e:
Logger.warning("TagReader::get_lyrics(): %s", e)
return None
......@@ -791,18 +796,3 @@ class TagReader:
#######################
# PRIVATE #
#######################
def __get_string_from_bytes(self, bytes, shift):
    """
        Decode the text carried by a raw frame
        @param bytes as bytes
        @param shift as int (ex: 4 for lyrics, to skip lang)
        @return str (empty when decoding fails)
    """
    try:
        # Frame data starts after the first 10 bytes, its first byte
        # is used as the encoding marker
        payload = bytes[10:]
        marker = payload[0:1]
        # Only the second part of the split is decoded
        _, text = splitUnicode(payload[shift:], marker)
        return decodeUnicode(text, marker)
    except Exception as e:
        Logger.error("TagReader::__get_string_from_bytes(): %s", e)
    return ""
......@@ -200,29 +200,22 @@ def id3EncodingToString(encoding):
# Decode raw tag bytes to str with the codec name that
# id3EncodingToString() gives for the encoding marker.
# NOTE(review): this span ends with two consecutive return statements and
# looks like two revisions of the body merged together (indentation is lost
# in this view, so it can not be told whether the first return belongs to
# the `if` body). Confirm which return is the intended one.
def decodeUnicode(bites, encoding):
codec = id3EncodingToString(encoding)
Logger.debug("Unicode encoding: %s" % codec)
# Odd-length utf_16* data ending with a NUL byte loses that last byte
if (codec.startswith("utf_16") and
len(bites) % 2 != 0 and bites[-1:] == b"\x00"):
# Catch and fix bad utf16 data, it is everywhere.
Logger.warning("Fixing utf16 data with extra zero bytes")
bites = bites[:-1]
# This variant also strips trailing NUL characters from the decoded text
return bites.decode(codec).rstrip("\x00")
# This variant returns the decoded text untouched
return bites.decode(codec)
# Split frame data on the NUL separator matching the encoding marker:
# one NUL byte for latin1/utf-8, two NUL bytes for the utf-16 variants.
# NOTE(review): two implementations appear interleaved in this span. The
# first one (try/except, down to "From eyeD3 end") returns a (d, t) tuple,
# the second one returns only the part after the separator as bytes.
# Callers have to agree on one return shape; confirm which one is current.
def splitUnicode(data, encoding):
from lollypop.define import LATIN1_ENCODING, UTF_8_ENCODING
from lollypop.define import UTF_16_ENCODING, UTF_16BE_ENCODING
try:
if encoding == LATIN1_ENCODING or encoding == UTF_8_ENCODING:
(d, t) = data.split(b"\x00", 1)
elif encoding == UTF_16_ENCODING or encoding == UTF_16BE_ENCODING:
# Two null bytes split, but since each utf16 char is also two
# bytes we need to ensure we found a proper boundary.
(d, t) = data.split(b"\x00\x00", 1)
if (len(d) % 2) != 0:
(d, t) = data.split(b"\x00\x00\x00", 1)
d += b"\x00"
except ValueError as ex:
# No separator found (unpacking failed): keep all the data as the
# first element and use an empty second element
Logger.warning("Invalid 2-tuple ID3 frame data: %s", ex)
d, t = data, b""
return (d, t)
# From eyeD3 end
# NOTE(review): in the variant below `split` is never assigned when the
# encoding matches neither branch; confirm callers always pass one of the
# four known markers.
if encoding == LATIN1_ENCODING or encoding == UTF_8_ENCODING:
if data.find(b"\x00") != -1:
(d, split) = data.split(b"\x00", 1)
else:
# No separator: the whole data is the text
split = data
elif encoding == UTF_16_ENCODING or encoding == UTF_16BE_ENCODING:
if data.find(b"\x00\x00") != -1:
(d, split) = data.split(b"\x00\x00", 1)
else:
# NOTE(review): unpacking raises ValueError when data holds no 0xff
# byte, and the text after the split no longer starts with 0xff;
# confirm this is intended.
(d, split) = data.split(b"\xff", 1)
# Pad to an even number of bytes
if len(split) % 2 != 0:
split += b"\x00"
return split
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment