tests/test_phystokens.py

# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Tests for coverage.py's improved tokenizer."""

from __future__ import annotations

import os.path
import re
import sys
import textwrap
import warnings

import pytest

from coverage import env
from coverage.phystokens import source_token_lines, source_encoding
from coverage.python import get_python_source

from tests.coveragetest import CoverageTest, TESTS_DIR


# A simple program and its token stream.
SIMPLE = """\
# yay!
def foo():
  say('two = %d' % 2)
"""

SIMPLE_TOKENS = [
    [('com', "# yay!")],
    [('key', 'def'), ('ws', ' '), ('nam', 'foo'), ('op', '('), ('op', ')'), ('op', ':')],
    [('ws', '  '), ('nam', 'say'), ('op', '('),
        ('str', "'two = %d'"), ('ws', ' '), ('op', '%'),
        ('ws', ' '), ('num', '2'), ('op', ')')],
]

# Mixed-white-space program, and its token stream.
MIXED_WS = """\
def hello():
        a="Hello world!"
\tb="indented"
"""

MIXED_WS_TOKENS = [
    [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')],
    [('ws', '        '), ('nam', 'a'), ('op', '='), ('str', '"Hello world!"')],
    [('ws', '        '), ('nam', 'b'), ('op', '='), ('str', '"indented"')],
]

# https://github.com/nedbat/coveragepy/issues/822
BUG_822 = """\
print( "Message 1" )
array = [ 1,2,3,4,       # 4 numbers \\
          5,6,7 ]        # 3 numbers
print( "Message 2" )
"""

class PhysTokensTest(CoverageTest):
    """Tests for coverage.py's improved tokenizer."""

    run_in_temp_dir = False

    def check_tokenization(self, source: str) -> None:
        """Tokenize `source`, then put it back together, should be the same."""
        tokenized = ""
        for line in source_token_lines(source):
            text = "".join(t for _, t in line)
            tokenized += text + "\n"
        # source_token_lines doesn't preserve trailing spaces, so trim all that
        # before comparing.
        source = source.replace('\r\n', '\n')
        source = re.sub(r"(?m)[ \t]+$", "", source)
        tokenized = re.sub(r"(?m)[ \t]+$", "", tokenized)
        assert source == tokenized

    def check_file_tokenization(self, fname: str) -> None:
        """Use the contents of `fname` for `check_tokenization`."""
        self.check_tokenization(get_python_source(fname))

    def test_simple(self) -> None:
        assert list(source_token_lines(SIMPLE)) == SIMPLE_TOKENS
        self.check_tokenization(SIMPLE)

    def test_missing_final_newline(self) -> None:
        # We can tokenize source that is missing the final newline.
        assert list(source_token_lines(SIMPLE.rstrip())) == SIMPLE_TOKENS

    def test_tab_indentation(self) -> None:
        # Mixed tabs and spaces...
        assert list(source_token_lines(MIXED_WS)) == MIXED_WS_TOKENS

    def test_bug_822(self) -> None:
        self.check_tokenization(BUG_822)

    def test_tokenize_real_file(self) -> None:
        # Check the tokenization of a real file (large, btw).
        real_file = os.path.join(TESTS_DIR, "test_coverage.py")
        self.check_file_tokenization(real_file)

    def test_1828(self) -> None:
        # https://github.com/nedbat/coveragepy/pull/1828
        tokens = list(source_token_lines(textwrap.dedent("""
            x = \
                1
            a = ["aaa",\\
                 "bbb \\
                 ccc"]
            """)))
        assert tokens == [
            [],
            [('nam', 'x'), ('ws', ' '), ('op', '='), ('ws', '                 '), ('num', '1')],
            [('nam', 'a'), ('ws', ' '), ('op', '='), ('ws', ' '),
                ('op', '['), ('str', '"aaa"'), ('op', ','), ('xx', '\\')],
            [('ws', '     '), ('str', '"bbb \\')],
            [('str', '     ccc"'), ('op', ']')],
        ]

    @pytest.mark.parametrize("fname", [
        "stress_phystoken.tok",
        "stress_phystoken_dos.tok",
    ])
    def test_stress(self, fname: str) -> None:
        # Check the tokenization of the stress-test files.
        # And check that those files haven't been incorrectly "fixed".
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message=r".*invalid escape sequence")

            stress = os.path.join(TESTS_DIR, fname)
            self.check_file_tokenization(stress)
            with open(stress) as fstress:
                assert re.search(r"(?m) $", fstress.read()), f"{stress} needs a trailing space."


@pytest.mark.skipif(not env.PYBEHAVIOR.soft_keywords, reason="Soft keywords are new in Python 3.10")
class SoftKeywordTest(CoverageTest):
    """Tests the tokenizer handling soft keywords."""

    run_in_temp_dir = False

    def test_soft_keywords_match_case(self) -> None:
        source = textwrap.dedent("""\
            match re.match(something):
                case ["what"]:
                    match = case("hello")
                case [_]:
                    match("hello")
                    match another.thing:
                        case 1:
                            pass

            class case(): pass
            def match():
                global case
            """)
        tokens = list(source_token_lines(source))
        print(tokens)
        assert tokens[0][0] == ("key", "match")
        assert tokens[0][4] == ("nam", "match")
        assert tokens[1][1] == ("key", "case")
        assert tokens[2][1] == ("nam", "match")
        assert tokens[2][5] == ("nam", "case")
        assert tokens[3][1] == ("key", "case")
        assert tokens[4][1] == ("nam", "match")
        assert tokens[5][1] == ("key", "match")
        assert tokens[6][1] == ("key", "case")
        assert tokens[9][2] == ("nam", "case")
        assert tokens[10][2] == ("nam", "match")
        assert tokens[11][3] == ("nam", "case")

    @pytest.mark.skipif(sys.version_info < (3, 12), reason="type is a soft keyword in 3.12")
    def test_soft_keyword_type(self) -> None:
        source = textwrap.dedent("""\
            type Point = tuple[float, float]
            type(int)
            """)
        tokens = list(source_token_lines(source))
        assert tokens[0][0] == ("key", "type")
        assert tokens[1][0] == ("nam", "type")


# The default source file encoding.
DEF_ENCODING = "utf-8"


ENCODING_DECLARATION_SOURCES = [
    # Various forms from http://www.python.org/dev/peps/pep-0263/
    (1, b"# coding=cp850\n\n", "cp850"),
    (1, b"# coding=latin-1\n", "iso-8859-1"),
    (1, b"# coding=iso-latin-1\n", "iso-8859-1"),
    (1, b"#!/usr/bin/python\n# -*- coding: cp850 -*-\n", "cp850"),
    (1, b"#!/usr/bin/python\n# vim: set fileencoding=cp850:\n", "cp850"),
    (1, b"# This Python file uses this encoding: cp850\n", "cp850"),
    (1, b"# This file uses a different encoding:\n# coding: cp850\n", "cp850"),
    (1, b"\n# coding=cp850\n\n", "cp850"),
    (2, b"# -*-  coding:cp850 -*-\n# vim: fileencoding=cp850\n", "cp850"),
]

class SourceEncodingTest(CoverageTest):
    """Tests of source_encoding() for detecting encodings."""

    run_in_temp_dir = False

    def test_detect_source_encoding(self) -> None:
        for _, source, expected in ENCODING_DECLARATION_SOURCES:
            assert source_encoding(source) == expected, f"Wrong encoding in {source!r}"

    def test_detect_source_encoding_not_in_comment(self) -> None:
        # Should not detect anything here
        source = b'def parse(src, encoding=None):\n    pass'
        assert source_encoding(source) == DEF_ENCODING

    def test_dont_detect_source_encoding_on_third_line(self) -> None:
        # A coding declaration doesn't count on the third line.
        source = b"\n\n# coding=cp850\n\n"
        assert source_encoding(source) == DEF_ENCODING

    def test_detect_source_encoding_of_empty_file(self) -> None:
        # An important edge case.
        assert source_encoding(b"") == DEF_ENCODING

    def test_bom(self) -> None:
        # A BOM means utf-8.
        source = b"\xEF\xBB\xBFtext = 'hello'\n"
        assert source_encoding(source) == 'utf-8-sig'

    def test_bom_with_encoding(self) -> None:
        source = b"\xEF\xBB\xBF# coding: utf-8\ntext = 'hello'\n"
        assert source_encoding(source) == 'utf-8-sig'

    def test_bom_is_wrong(self) -> None:
        # A BOM with an explicit non-utf8 encoding is an error.
        source = b"\xEF\xBB\xBF# coding: cp850\n"
        with pytest.raises(SyntaxError, match="encoding problem: utf-8"):
            source_encoding(source)

    def test_unknown_encoding(self) -> None:
        source = b"# coding: klingon\n"
        with pytest.raises(SyntaxError, match="unknown encoding: klingon"):
            source_encoding(source)