Source code for magic

"""
magic is a wrapper around the libmagic file identification library.

See README for more information.

Usage:

>>> import magic
>>> magic.from_file("testdata/test.pdf")
'PDF document, version 1.2'
>>> magic.from_file("testdata/test.pdf", mime=True)
'application/pdf'
>>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
'PDF document, version 1.2'
>>>

"""

import sys
import glob
import ctypes
import ctypes.util
import threading
import logging

from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER

# avoid shadowing the real open with the version from compat.py:
# _add_compat() installs compat's `open` into this module's globals, so a
# reference to the builtin is kept here for internal use (see Magic.from_file).
_real_open = open


class MagicException(Exception):
    """
    Raised when a libmagic call reports a failure.

    message - the error text reported for the failing call.  It may be
              None when no error text is available, so callers must not
              assume it is a string.
    """

    def __init__(self, message):
        # Name this class (not Exception) in super() so the full MRO,
        # including Exception.__init__, is honoured.
        super(MagicException, self).__init__(message)
        self.message = message


class Magic:
    """
    Magic is a wrapper around the libmagic C library.

    Each instance owns one libmagic cookie opened with the flags chosen in
    the constructor.  The from_buffer/from_file/from_descriptor methods
    hold `self.lock` while they call into libmagic with that cookie.
    """

    def __init__(self, mime=False, magic_file=None, mime_encoding=False,
                 keep_going=False, uncompress=False, raw=False, extension=False):
        """
        Create a new libmagic wrapper.

        mime - if True, mimetypes are returned instead of textual descriptions
        mime_encoding - if True, codec is returned
        magic_file - use a mime database other than the system default
        keep_going - don't stop at the first match, keep going
        uncompress - Try to look inside compressed files.
        raw - Do not try to decode "non-printable" chars.
        extension - Print a slash-separated list of valid extensions for the file type found.

        Raises MagicException if loading the magic database fails, and
        NotImplementedError if `extension` is requested but the loaded
        libmagic does not report a version of at least 524.
        """
        self.flags = MAGIC_NONE
        if mime:
            self.flags |= MAGIC_MIME_TYPE
        if mime_encoding:
            self.flags |= MAGIC_MIME_ENCODING
        if keep_going:
            self.flags |= MAGIC_CONTINUE
        if uncompress:
            self.flags |= MAGIC_COMPRESS
        if raw:
            self.flags |= MAGIC_RAW
        if extension:
            self.flags |= MAGIC_EXTENSION

        self.cookie = magic_open(self.flags)
        self.lock = threading.Lock()

        magic_load(self.cookie, magic_file)

        # MAGIC_EXTENSION was added in 523 or 524, so bail if
        # it doesn't appear to be available
        if extension and (not _has_version or version() < 524):
            raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')

        # For https://github.com/ahupp/python-magic/issues/190
        # libmagic has fixed internal limits that some files exceed, causing
        # an error.  We can avoid this (at least for the sample file given)
        # by bumping the limit up.  It's not clear if this is a general solution
        # or whether other internal limits should be increased, but given
        # the lack of other reports I'll assume this is rare.
        if _has_param:
            try:
                self.setparam(MAGIC_PARAM_NAME_MAX, 64)
            except MagicException:
                # some versions of libmagic fail this call,
                # so rather than fail hard just use default behavior
                pass

    def from_buffer(self, buf):
        """
        Identify the contents of `buf`

        buf - the data to identify.  On Python 3 a str (or str subclass) is
              encoded to UTF-8 with errors='replace' before being handed
              to libmagic.

        Returns the libmagic result passed through maybe_decode().
        """
        with self.lock:
            try:
                # if we're on python3, convert buf to bytes
                # otherwise this string is passed as wchar*
                # which is not what libmagic expects.
                # isinstance (rather than an exact type match) so that str
                # subclasses are converted as well.
                # NEXTBREAK: only take bytes
                if isinstance(buf, str) and str != bytes:
                    buf = buf.encode('utf-8', errors='replace')
                return maybe_decode(magic_buffer(self.cookie, buf))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_file(self, filename):
        """
        Identify the contents of the file named `filename`.

        The file is opened (and immediately closed) first so that a missing
        or unreadable file raises the usual open() error rather than a
        libmagic error.
        """
        # raise FileNotFoundError or IOError if the file does not exist
        with _real_open(filename):
            pass

        with self.lock:
            try:
                return maybe_decode(magic_file(self.cookie, filename))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_descriptor(self, fd):
        """
        Identify the contents of the file open on the integer descriptor `fd`.
        """
        with self.lock:
            try:
                return maybe_decode(magic_descriptor(self.cookie, fd))
            except MagicException as e:
                return self._handle509Bug(e)

    def _handle509Bug(self, e):
        """
        Work around a libmagic 5.09 failure mode for MagicException `e`.

        Returns "application/octet-stream" when `e` carries no message and
        this instance was opened with MAGIC_MIME_TYPE; otherwise re-raises `e`.
        """
        # libmagic 5.09 has a bug where it might fail to identify the
        # mimetype of a file and returns null from magic_file (and
        # likely _buffer), but also does not return an error message.
        if e.message is None and (self.flags & MAGIC_MIME_TYPE):
            return "application/octet-stream"
        else:
            raise e

    def setparam(self, param, val):
        """
        Set libmagic parameter `param` (a MAGIC_PARAM_* value) to `val`
        on this instance's cookie.
        """
        return magic_setparam(self.cookie, param, val)

    def getparam(self, param):
        """
        Return the current value of libmagic parameter `param`
        (a MAGIC_PARAM_* value) for this instance's cookie.
        """
        return magic_getparam(self.cookie, param)

    def __del__(self):
        # no _thread_check here because there can be no other
        # references to this object at this point.

        # during shutdown magic_close may have been cleared already so
        # make sure it exists before using it.

        # the self.cookie check should be unnecessary and was an
        # incorrect fix for a threading problem, however I'm leaving
        # it in because it's harmless and I'm slightly afraid to
        # remove it.
        if hasattr(self, 'cookie') and self.cookie and magic_close:
            magic_close(self.cookie)
            self.cookie = None


# Shared Magic instances used by the module-level helpers, keyed by the
# `mime` argument they were created with.
_instances = {}


def _get_magic_type(mime):
    """
    Return the shared Magic instance for `mime`, creating and caching it
    in `_instances` the first time it is requested.
    """
    cached = _instances.get(mime)
    if cached is not None:
        return cached

    created = Magic(mime=mime)
    _instances[mime] = created
    return created


def from_file(filename, mime=False):
    """
    Identify the file named `filename` using a shared Magic instance.

    The result is the mimetype when mime=True, otherwise a human
    readable description.

    >>> magic.from_file("testdata/test.pdf", mime=True)
    'application/pdf'
    """
    return _get_magic_type(mime).from_file(filename)


def from_buffer(buffer, mime=False):
    """
    Identify the contents of the binary string `buffer` using a shared
    Magic instance.

    The result is the mimetype when mime=True, otherwise a human
    readable description.

    >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_buffer(buffer)


def from_descriptor(fd, mime=False):
    """
    Identify the file open on the file descriptor `fd` using a shared
    Magic instance.

    The result is the mimetype when mime=True, otherwise a human
    readable description.

    >>> f = open("testdata/test.pdf")
    >>> magic.from_descriptor(f.fileno())
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_descriptor(fd)

# Obtain the libmagic library handle from the package's loader helper.
# The functions bound below are looked up as attributes on this object.
from . import loader
libmagic = loader.load_lib()

# Type used for the libmagic cookie in the ctypes signatures below: an
# opaque pointer.
magic_t = ctypes.c_void_p


def errorcheck_null(result, func, args):
    """
    ctypes errcheck hook for functions that signal failure with NULL.

    Returns `result` unchanged unless it is None, in which case a
    MagicException is raised carrying magic_error() for the cookie in
    args[0].
    """
    if result is not None:
        return result

    raise MagicException(magic_error(args[0]))


def errorcheck_negative_one(result, func, args):
    """
    ctypes errcheck hook for functions that signal failure with -1.

    Returns `result` unchanged unless it equals -1, in which case a
    MagicException is raised carrying magic_error() for the cookie in
    args[0].
    """
    if result != -1:
        return result

    raise MagicException(magic_error(args[0]))


# Return str on python3.  The decode is conditional because decoding
# unconditionally would produce unicode on python2.
def maybe_decode(s):
    """
    Return `s` as the native str type: unchanged where str is bytes,
    otherwise decoded as UTF-8 with undecodable bytes backslash-escaped.
    """
    # NEXTBREAK: remove
    if str != bytes:
        # backslashreplace here because sometimes libmagic will return metadata in the charset
        # of the file, which is unknown to us (e.g the title of a Word doc)
        return s.decode('utf-8', 'backslashreplace')
    return s


# unpath() converts os.PathLike objects to their underlying path value.
# Where os.PathLike cannot be imported it is the identity function.
try:
    from os import PathLike
except ImportError:
    def unpath(filename):
        """Return `filename` unchanged (os.PathLike is unavailable)."""
        return filename
else:
    def unpath(filename):
        """
        Return filename.__fspath__() for os.PathLike objects, and
        `filename` itself for anything else.
        """
        return filename.__fspath__() if isinstance(filename, PathLike) else filename

def coerce_filename(filename):
    """
    Convert `filename` into the form handed to libmagic.

    None is passed through.  Path-like objects are unwrapped with
    unpath(), and text strings are encoded as UTF-8 using the
    'surrogateescape' error handler.  Anything else is returned as is.
    """
    if filename is None:
        return None

    path = unpath(filename)

    # ctypes will implicitly convert unicode strings to bytes with
    # .encode('ascii').  If you use the filesystem encoding
    # then you'll get inconsistent behavior (crashes) depending on the user's
    # LANG environment variable
    # NEXTBREAK: remove
    if sys.version_info[0] >= 3:
        needs_encoding = isinstance(path, str)
    else:
        needs_encoding = isinstance(path, unicode)

    if needs_encoding:
        return path.encode('utf-8', 'surrogateescape')
    return path


# ctypes signatures for the basic cookie functions.  None of these has an
# errcheck hook installed, so their return values reach the caller as-is.

# Takes an int of flags and returns a cookie (magic_t).
magic_open = libmagic.magic_open
magic_open.restype = magic_t
magic_open.argtypes = [c_int]

# Takes a cookie; no return value.
magic_close = libmagic.magic_close
magic_close.restype = None
magic_close.argtypes = [magic_t]

# Takes a cookie and returns a char* (bytes, or None for NULL).  The
# errorcheck_* hooks use this as the message of the MagicException they raise.
magic_error = libmagic.magic_error
magic_error.restype = c_char_p
magic_error.argtypes = [magic_t]

# Takes a cookie and returns an int.
magic_errno = libmagic.magic_errno
magic_errno.restype = c_int
magic_errno.argtypes = [magic_t]

# Raw binding: (cookie, char* filename) -> char*.  A NULL result is turned
# into a MagicException by errorcheck_null.
_magic_file = libmagic.magic_file
_magic_file.argtypes = [magic_t, c_char_p]
_magic_file.restype = c_char_p
_magic_file.errcheck = errorcheck_null


def magic_file(cookie, filename):
    """
    Call libmagic's magic_file on `cookie` for `filename`, after passing
    the name through coerce_filename().
    """
    encoded_name = coerce_filename(filename)
    return _magic_file(cookie, encoded_name)


# Raw binding: (cookie, void* data, size_t length) -> char*.  A NULL result
# is turned into a MagicException by errorcheck_null.
_magic_buffer = libmagic.magic_buffer
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
_magic_buffer.restype = c_char_p
_magic_buffer.errcheck = errorcheck_null


def magic_buffer(cookie, buf):
    """
    Call libmagic's magic_buffer on `cookie` with `buf` and its length.
    """
    size = len(buf)
    return _magic_buffer(cookie, buf, size)


# Raw binding: (cookie, int fd) -> char*.  A NULL result is turned into a
# MagicException by errorcheck_null.
#
# The binding is configured once, under the private name only; the public
# name `magic_descriptor` is the Python wrapper defined below.
_magic_descriptor = libmagic.magic_descriptor
_magic_descriptor.restype = c_char_p
_magic_descriptor.argtypes = [magic_t, c_int]
_magic_descriptor.errcheck = errorcheck_null


def magic_descriptor(cookie, fd):
    """
    Call libmagic's magic_descriptor on `cookie` for the integer file
    descriptor `fd`.
    """
    return _magic_descriptor(cookie, fd)


# Raw binding: (cookie, char* filename) -> int.  A result of -1 is turned
# into a MagicException by errorcheck_negative_one.
_magic_load = libmagic.magic_load
_magic_load.argtypes = [magic_t, c_char_p]
_magic_load.restype = c_int
_magic_load.errcheck = errorcheck_negative_one


def magic_load(cookie, filename):
    """
    Call libmagic's magic_load on `cookie` for `filename`, after passing
    the name through coerce_filename() (None is passed through as None).
    """
    encoded_name = coerce_filename(filename)
    return _magic_load(cookie, encoded_name)


# The three bindings below have no errcheck hook, so callers receive the raw
# int result and must interpret it themselves.

# (cookie, int flags) -> int
magic_setflags = libmagic.magic_setflags
magic_setflags.restype = c_int
magic_setflags.argtypes = [magic_t, c_int]

# (cookie, char* filename) -> int
magic_check = libmagic.magic_check
magic_check.restype = c_int
magic_check.argtypes = [magic_t, c_char_p]

# (cookie, char* filename) -> int
magic_compile = libmagic.magic_compile
magic_compile.restype = c_int
magic_compile.argtypes = [magic_t, c_char_p]

# magic_setparam/magic_getparam are bound only when the loaded library
# exposes both of them; _has_param records whether that is the case.
_has_param = hasattr(libmagic, 'magic_setparam') and hasattr(libmagic, 'magic_getparam')
if _has_param:
    # (cookie, int param, size_t* value) -> int; -1 raises MagicException.
    _magic_setparam = libmagic.magic_setparam
    _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_setparam.restype = c_int
    _magic_setparam.errcheck = errorcheck_negative_one

    # (cookie, int param, size_t* value) -> int; -1 raises MagicException.
    _magic_getparam = libmagic.magic_getparam
    _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_getparam.restype = c_int
    _magic_getparam.errcheck = errorcheck_negative_one


def magic_setparam(cookie, param, val):
    """
    Set libmagic parameter `param` to `val` on `cookie`.

    Raises NotImplementedError when the loaded library has no
    magic_setparam/magic_getparam.
    """
    if not _has_param:
        raise NotImplementedError("magic_setparam not implemented")

    # libmagic takes the value by pointer, so box it in a c_size_t first.
    boxed = c_size_t(val)
    return _magic_setparam(cookie, param, byref(boxed))


def magic_getparam(cookie, param):
    """
    Return the value of libmagic parameter `param` for `cookie`.

    Raises NotImplementedError when the loaded library has no
    magic_setparam/magic_getparam.
    """
    if not _has_param:
        raise NotImplementedError("magic_getparam not implemented")

    # libmagic writes the value through the pointer we pass in.
    out = c_size_t()
    _magic_getparam(cookie, param, byref(out))
    return out.value


# magic_version is bound only when the loaded library exposes it;
# _has_version records whether that is the case.
_has_version = hasattr(libmagic, "magic_version")
if _has_version:
    # () -> int
    magic_version = libmagic.magic_version
    magic_version.argtypes = []
    magic_version.restype = c_int


def version():
    """
    Return the integer reported by libmagic's magic_version().

    Raises NotImplementedError when the loaded library has no
    magic_version.
    """
    if _has_version:
        return magic_version()
    raise NotImplementedError("magic_version not implemented")


# Flag bits passed to magic_open() / magic_setflags().  They are OR-ed
# together to form the flags value (see Magic.__init__).
MAGIC_NONE = 0x000000  # No flags
MAGIC_DEBUG = 0x000001  # Turn on debugging
MAGIC_SYMLINK = 0x000002  # Follow symlinks
MAGIC_COMPRESS = 0x000004  # Check inside compressed files
MAGIC_DEVICES = 0x000008  # Look at the contents of devices
MAGIC_MIME_TYPE = 0x000010  # Return a mime string
MAGIC_MIME_ENCODING = 0x000400  # Return the MIME encoding
# TODO:  should be
# MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
# As defined here, MAGIC_MIME has the same value as MAGIC_MIME_TYPE alone.
MAGIC_MIME = 0x000010  # Return a mime string
MAGIC_EXTENSION = 0x1000000  # Return a /-separated list of extensions

MAGIC_CONTINUE = 0x000020  # Return all matches
MAGIC_CHECK = 0x000040  # Print warnings to stderr
MAGIC_PRESERVE_ATIME = 0x000080  # Restore access time on exit
MAGIC_RAW = 0x000100  # Don't translate unprintable chars
MAGIC_ERROR = 0x000200  # Handle ENOENT etc as real errors

MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
MAGIC_NO_CHECK_TAR = 0x002000  # Don't check for tar files
MAGIC_NO_CHECK_SOFT = 0x004000  # Don't check magic entries
MAGIC_NO_CHECK_APPTYPE = 0x008000  # Don't check application type
MAGIC_NO_CHECK_ELF = 0x010000  # Don't check for elf details
MAGIC_NO_CHECK_ASCII = 0x020000  # Don't check for ascii files
MAGIC_NO_CHECK_TROFF = 0x040000  # Don't check ascii/troff
MAGIC_NO_CHECK_FORTRAN = 0x080000  # Don't check ascii/fortran
MAGIC_NO_CHECK_TOKENS = 0x100000  # Don't check ascii/tokens

# Parameter identifiers for magic_setparam() / magic_getparam().
MAGIC_PARAM_INDIR_MAX = 0  # Recursion limit for indirect magic
MAGIC_PARAM_NAME_MAX = 1  # Use count limit for name/use magic
MAGIC_PARAM_ELF_PHNUM_MAX = 2  # Max ELF program sections processed
MAGIC_PARAM_ELF_SHNUM_MAX = 3  # Max ELF sections processed
MAGIC_PARAM_ELF_NOTES_MAX = 4  # Max ELF notes processed
MAGIC_PARAM_REGEX_MAX = 5  # Length limit for regex searches
MAGIC_PARAM_BYTES_MAX = 6  # Max number of bytes to read from file


# This package name conflicts with the one provided by upstream
# libmagic.  This is a common source of confusion for users.  To
# resolve this, we ship a copy of that module and expose its functions
# wrapped in deprecation warnings.
def _add_compat(to_module):
    """
    Install the bundled `compat` module's API into the mapping `to_module`.

    to_module - a namespace dict (the caller passes globals()).

    Four compat functions are added, each wrapped so that calling it emits
    a PendingDeprecationWarning first.  Every name in compat made up only
    of upper-case letters and underscores is then copied over as a
    constant.

    Raises Exception if a constant already present in `to_module` has a
    different value in compat, unless it is listed as allowed to differ.
    """
    import functools
    import re
    import warnings
    # Relative import, matching how this module imports `loader`.
    from . import compat

    def deprecation_wrapper(fn):
        # functools.wraps keeps the wrapped function's name and docstring.
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            # stacklevel=2 attributes the warning to the caller of the
            # compat function instead of to this wrapper.
            warnings.warn(
                "Using compatibility mode with libmagic's python binding. "
                "See https://github.com/ahupp/python-magic/blob/master/COMPAT.md for details.",
                PendingDeprecationWarning,
                stacklevel=2)

            return fn(*args, **kwargs)

        return wrapped

    compat_functions = ('detect_from_filename',
                        'detect_from_content',
                        'detect_from_fobj',
                        'open')
    for fname in compat_functions:
        to_module[fname] = deprecation_wrapper(compat.__dict__[fname])

    # copy constants over, ensuring there's no conflicts
    is_const_re = re.compile("^[A-Z_]+$")
    allowed_inconsistent = {'MAGIC_MIME'}
    for name, value in compat.__dict__.items():
        if not is_const_re.match(name):
            continue
        if name not in to_module:
            to_module[name] = value
        elif name not in allowed_inconsistent and to_module[name] != value:
            raise Exception("inconsistent value for " + name)


_add_compat(globals())