Source code for PyFunceble.converter.adblock_input_line2subject

# pylint:disable=line-too-long
"""
The tool to check the availability or syntax of domain, IP or URL.

::


    ██████╗ ██╗   ██╗███████╗██╗   ██╗███╗   ██╗ ██████╗███████╗██████╗ ██╗     ███████╗
    ██╔══██╗╚██╗ ██╔╝██╔════╝██║   ██║████╗  ██║██╔════╝██╔════╝██╔══██╗██║     ██╔════╝
    ██████╔╝ ╚████╔╝ █████╗  ██║   ██║██╔██╗ ██║██║     █████╗  ██████╔╝██║     █████╗
    ██╔═══╝   ╚██╔╝  ██╔══╝  ██║   ██║██║╚██╗██║██║     ██╔══╝  ██╔══██╗██║     ██╔══╝
    ██║        ██║   ██║     ╚██████╔╝██║ ╚████║╚██████╗███████╗██████╔╝███████╗███████╗
    ╚═╝        ╚═╝   ╚═╝      ╚═════╝ ╚═╝  ╚═══╝ ╚═════╝╚══════╝╚═════╝ ╚══════╝╚══════╝

Provides the conversion of an AdBlock input line into testable subjects.

Author:
    Nissar Chababy, @funilrys, contactTATAfunilrysTODTODcom

Special thanks:
    https://pyfunceble.github.io/#/special-thanks

Contributors:
    https://pyfunceble.github.io/#/contributors

Project link:
    https://github.com/funilrys/PyFunceble

Project documentation:
    https://pyfunceble.readthedocs.io/en/dev/

Project homepage:
    https://pyfunceble.github.io/

License:
::


    Copyright 2017, 2018, 2019, 2020 Nissar Chababy

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
"""

from typing import Any, List, Optional, Union

from PyFunceble.checker.syntax.domain import DomainSyntaxChecker
from PyFunceble.checker.syntax.ip import IPSyntaxChecker
from PyFunceble.converter.base import ConverterBase
from PyFunceble.converter.url2netloc import Url2Netloc
from PyFunceble.helpers.list import ListHelper
from PyFunceble.helpers.regex import RegexHelper


class AdblockInputLine2Subject(ConverterBase):
    """
    Converts/Extracts the subjects to test from an inputted AdBlock line.

    In order to decode the given line, this class and its conversion method
    go through a bunch of decoding methods (see :code:`__decode_v1` to
    :code:`__decode_v4`) and merge their results.
    """

    # Separates two options inside the option section of a filter.
    OPTION_SEPARATOR: str = ","
    # Separates the filter itself from its option section.
    OPTIONS_SEPARATOR: str = "$"

    _aggressive: bool = False

    def __init__(
        self, data_to_convert: Optional[Any] = None, aggressive: bool = False
    ) -> None:
        # The aggressive state is set before delegating to the parent
        # constructor, exactly as the parent receives the data to convert.
        if aggressive is not None:
            self.aggressive = aggressive

        super().__init__(data_to_convert)

    @ConverterBase.data_to_convert.setter
    def data_to_convert(self, value: Any) -> None:
        """
        Overwrites the default behavior.

        :raise TypeError:
            When the given data to convert is not :py:class:`str`
        """

        if not isinstance(value, str):
            raise TypeError(f"<value> should be {str}, {type(value)} given.")

        # Delegate the actual storage to the setter of the parent property.
        # pylint: disable=no-member
        super(AdblockInputLine2Subject, self.__class__).data_to_convert.fset(
            self, value
        )

    @property
    def aggressive(self) -> bool:
        """
        Provides the state of the :code:`_aggressive` attribute.
        """

        return self._aggressive

    @aggressive.setter
    def aggressive(self, value: bool) -> None:
        """
        Provides a way to activate/deactivate the aggressive decoding.

        :raise TypeError:
            When the given value is not a :py:class:`bool`
        """

        if not isinstance(value, bool):
            raise TypeError(f"<value> should be {bool}, {type(value)} given.")

        self._aggressive = value

    def set_aggressive(self, value: bool) -> "AdblockInputLine2Subject":
        """
        Provides a way to activate/deactivate the aggressive decoding.

        :param value:
            The state to apply.

        :return:
            The current instance, so that calls can be chained.
        """

        self.aggressive = value

        return self

    @staticmethod
    def should_be_ignored(line: str) -> bool:
        """
        Checks if we should ignore the given line.

        A line is ignored when, once stripped, it starts with one of:
        :code:`!`, :code:`@@`, :code:`/`, :code:`[`, :code:`.`, :code:`-`,
        :code:`_`, :code:`?` or :code:`&`.

        :param line:
            The line to check.
        """

        to_ignore = r"(^!|^@@|^\/|^\[|^\.|^-|^_|^\?|^&)"

        return RegexHelper(to_ignore).match(line.strip(), return_match=False)

    @classmethod
    def extract_base(
        cls, subject: Union[str, List[str]]
    ) -> Union[str, List[str]]:
        """
        Extracts the base of the given element (supposed URL).

        As example:

            Giving :code:`"hello.world/?is=beautiful"` returns
            :code:`"hello.world"`

        :param subject:
            The subject (or list of subjects) to work with.

        :return:
            The extracted base. When a list is given, a list with the base
            of each of its elements is returned.
        """

        if isinstance(subject, list):
            return [cls.extract_base(x) for x in subject]

        try:
            return Url2Netloc(subject).get_converted()
        except ValueError:
            # The converter refused the input; hand it back untouched.
            return subject

    @classmethod
    def __format_decoded(
        cls, decoded: List[str], *, result: Optional[List[str]] = None
    ) -> List[str]:
        """
        A recursive method which filters and formats the decoded data in
        order to delete unneeded parts.

        :param decoded:
            The decoded parts.
        :param result:
            The list which accumulates the subjects found so far. A new one
            is created when not given.

        :return:
            The list of extracted domains or IPs with a valid syntax.
        """

        if result is None:
            result = []

        chars_to_split = ["^", "#", ",", "!", "|"]

        for data in decoded:
            if not data:
                continue

            splitter = next(
                (char for char in chars_to_split if char in data), None
            )

            if splitter is not None:
                # Recurse into the pieces, then carry on with the remaining
                # elements. Returning here would silently drop every sibling
                # which comes after the first element that has to be split.
                cls.__format_decoded(data.split(splitter), result=result)
                continue

            data = cls.extract_base(data)

            if data and (
                DomainSyntaxChecker(data).is_valid()
                or IPSyntaxChecker(data).is_valid()
            ):
                result.append(data)

        return result

    def __filter_options(self, options: List[str]) -> Union[bool, List[str]]:
        """
        Filters the interesting parts of the given list of options.

        Only the :code:`domain=` option is looked at. Its value is split on
        :code:`|` and the entries starting with :code:`~` are dropped.

        :param options:
            The extracted options to filter.

        :return:
            The list of domains found when the aggressive mode is active,
            otherwise a boolean telling whether at least one domain was found.

        .. warning::
            The domains themselves are only provided if the aggressive mode
            is activated.
        """

        result = []

        regex_domain_in_option = r"domain=(.*)"

        for option in options:
            try:
                domains = RegexHelper(regex_domain_in_option).match(
                    option, return_match=True, rematch=True, group=0
                )[-1]
            except TypeError:
                # Nothing subscriptable came back: not a domain option.
                continue

            result.extend(
                [x for x in domains.split("|") if x and not x.startswith("~")]
            )

        if self.aggressive:
            return result

        return bool(result)

    def __decode_v1(self, line: str) -> List[str]:
        """
        Our first decoding version.

        The main idea is to filter based on option and a pattern common to
        all AdBlock / Ublock format.

        :param line:
            The line to decode.
        """

        result = []

        # Get all groups :-)
        rematch = RegexHelper(r"^(?:.*\|\|)([^\/\$\^]{1,}).*$").match(
            line, return_match=True, group=0, rematch=True
        )

        if rematch:
            if self.OPTIONS_SEPARATOR in line:
                # We get the list of options for filtering.
                options = line.split(self.OPTIONS_SEPARATOR)[-1].split(
                    self.OPTION_SEPARATOR
                )

                # pylint: disable=too-many-boolean-expressions
                if (
                    not options[-1]
                    or "third-party" in options
                    or "script" in options
                    or "popup" in options
                    or "xmlhttprequest" in options
                    or "all" in options
                    or "document" in options
                ):
                    result.extend(self.extract_base(rematch))

                extra = self.__filter_options(options)

                if extra:
                    if isinstance(extra, list):
                        # Aggressive mode: the domains of the options are
                        # tested along with the matched subject.
                        extra.extend(self.extract_base(rematch))
                        result.extend(self.extract_base(extra))
                    else:
                        result.extend(self.extract_base(rematch))
            else:
                result.extend(self.extract_base(rematch))

        return result

    def __decode_v2(self, line: str) -> List[str]:
        """
        Our second decoding version.

        The main idea here is that we will match simple records, meaning
        lines of the form :code:`|something.with.dot|`.

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(r"^\|(.*\..*)\|$").match(
            line, return_match=True, rematch=True, group=0
        )

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def __decode_v3(self, line: str) -> List[str]:
        """
        Our third decoding version.

        This one is for more complex formats (and Ublock).

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(
            r"(?:#+(?:[a-z]+?)?\[[a-z]+(?:\^|\*)\=(?:\'|\"))(.*\..*)(?:(?:\'|\")\])"
        ).match(line, return_match=True, rematch=True, group=0)

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def __decode_v4(self, line: str) -> List[str]:
        """
        Our fourth decoding version.

        This one is for the lines whose subjects are followed by
        :code:`##` or :code:`#@#`.

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(r"^(.*?)(?:#{2}|#@#)").match(
            line, return_match=True, rematch=True, group=0
        )

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def get_converted(self) -> List[str]:
        """
        Provides the subjects to test.

        Every decoding version is applied to the line, unless the line
        should be ignored. The results are merged, deduplicated and sorted.
        """

        result = []

        if not self.should_be_ignored(self.data_to_convert.strip()):
            result.extend(self.__decode_v1(self.data_to_convert))
            result.extend(self.__decode_v2(self.data_to_convert))
            result.extend(self.__decode_v3(self.data_to_convert))
            result.extend(self.__decode_v4(self.data_to_convert))

        return ListHelper(result).remove_duplicates().sort().subject