Source code for PyFunceble.converter.adblock

"""
The tool to check the availability or syntax of domain, IP or URL.

::


    ██████╗ ██╗   ██╗███████╗██╗   ██╗███╗   ██╗ ██████╗███████╗██████╗ ██╗     ███████╗
    ██╔══██╗╚██╗ ██╔╝██╔════╝██║   ██║████╗  ██║██╔════╝██╔════╝██╔══██╗██║     ██╔════╝
    ██████╔╝ ╚████╔╝ █████╗  ██║   ██║██╔██╗ ██║██║     █████╗  ██████╔╝██║     █████╗
    ██╔═══╝   ╚██╔╝  ██╔══╝  ██║   ██║██║╚██╗██║██║     ██╔══╝  ██╔══██╗██║     ██╔══╝
    ██║        ██║   ██║     ╚██████╔╝██║ ╚████║╚██████╗███████╗██████╔╝███████╗███████╗
    ╚═╝        ╚═╝   ╚═╝      ╚═════╝ ╚═╝  ╚═══╝ ╚═════╝╚══════╝╚═════╝ ╚══════╝╚══════╝

Provides an adblock decoding interface.

Author:
    Nissar Chababy, @funilrys, contactTATAfunilrysTODTODcom

Special thanks:
    https://pyfunceble.github.io/special-thanks.html

Contributors:
    https://pyfunceble.github.io/contributors.html

Project link:
    https://github.com/funilrys/PyFunceble

Project documentation:
    https://pyfunceble.readthedocs.io/en/master/

Project homepage:
    https://pyfunceble.github.io/

License:
::


    Copyright 2017, 2018, 2019, 2020 Nissar Chababy

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
"""
from PyFunceble import helpers
from PyFunceble.check import Check
from PyFunceble.exceptions import WrongParameterType

from .base import ConverterBase


class AdBlock(ConverterBase):
    """
    Converts an adblock filter line (or a list of them) into the list of
    subjects to test.

    :param data_to_convert: The adblock line(s) to decode.
    :type data_to_convert: str|list

    :param bool aggressive:
        When :code:`True`, the domains listed under the :code:`domain=`
        option of a rule are extracted too.

    :raise WrongParameterType:
        When :code:`data_to_convert` is neither a :code:`str` nor a
        :code:`list`.
    """

    # Separates the rule itself from its options.
    options_separator = "$"
    # Separates each option from the next one.
    option_separator = ","

    def __init__(self, data_to_convert, aggressive=False):
        if not isinstance(data_to_convert, (str, list)):
            raise WrongParameterType(
                f"<data_to_convert> should be {str} or {list}, {type(data_to_convert)} given."
            )

        super().__init__(data_to_convert)

        self.aggressive = aggressive

    @classmethod
    def ignore_it(cls, subject):
        """
        Checks if we have to ignore the given subject.

        :param str subject: The subject to work with.

        :return: :code:`True` if the subject has to be ignored.
        :rtype: bool
        """

        # The list of regex which, once matched against the stripped
        # subject, mark it as ignored.
        #
        # Note: In a more aggressive way, r"(\$|,)(image)" may be added.
        to_ignore = [r"(^!|^@@|^\/|^\[|^\.|^-|^_|^\?|^&)"]

        stripped = subject.strip()

        return any(
            helpers.Regex(pattern).match(stripped, return_match=False)
            for pattern in to_ignore
        )

    def remove_ignored(self, subject):
        """
        Removes the ignored element(s) from the given subject(s).

        :param subject: A single line or a list of lines.
        :type subject: str|list

        :return: The stripped lines which do not have to be ignored.
        :rtype: list
        """

        if isinstance(subject, str):
            return [] if self.ignore_it(subject) else [subject.strip()]

        return [line.strip() for line in subject if not self.ignore_it(line)]

    def extract_from_options(self, options):
        """
        Extracts the relevant data from the list of options.

        :param list options: The list of options of a rule.

        :return:
            In aggressive mode, the list of domains declared through the
            :code:`domain=` option (negated entries, i.e. the ones starting
            with :code:`~`, are left out).

            In non-aggressive mode, :code:`True` as soon as a non-empty
            :code:`domain=` option is found, an empty list otherwise.
        :rtype: list|bool
        """

        result = []

        # The regex used to extract the domains listed under the option
        # domain=
        regex_domain_option = r"domain=(.*)"

        for option in options:
            try:
                # When nothing matches, match() gives back False and the
                # subscript raises the TypeError we catch right below.
                domains = helpers.Regex(regex_domain_option).match(
                    option, return_match=True, rematch=True, group=0
                )[-1]
            except TypeError:
                continue

            if domains:
                # We could extract something.

                if self.aggressive:  # pragma: no cover
                    result.extend(
                        [
                            x
                            for x in domains.split("|")
                            if x and not x.startswith("~")
                        ]
                    )
                else:
                    return True

        return result

    def extract_base(self, subject):
        """
        Extracts the base of the given element.

        As an example:
            given :code:`"hello.world/?is=beautiful"` returns
            :code:`"hello.world"`

        :param subject: The subject to work with.
        :type subject: str|list

        :return:
            The base of the subject, or the list of bases if a list was
            given.
        :rtype: str|list
        """

        if isinstance(subject, list):
            return [self.extract_base(x) for x in subject]

        base = Check(subject).is_url(return_base=True)

        if base:
            return base

        if "/" in subject:
            # Not a URL: we keep everything before the first slash.
            return subject.split("/")[0]

        return subject

    def format_decoded(self, decoded, result=None):  # pragma: no cover
        """
        Formats the extracted adblock line in order to be compatible
        with what the system understand.

        :param list decoded: The decoded data to work with.
        :param list result:
            The list to append the results to. It is shared with the
            recursive calls of this method.

        :return: The list of domains or IP compatible with the system.
        :rtype: list
        """

        if result is None:
            result = []

        # The characters on which a decoded entry gets split. They are
        # tried in this exact order.
        separators = ("^", "#", ",", "!", "|")

        for data in decoded:
            if not data:
                continue

            separator = next((sep for sep in separators if sep in data), None)

            if separator is not None:
                # We recall this method with the splitted data and the
                # current result state. As the result list is shared, we
                # only have to move on to the next entry afterwards.
                #
                # Note: returning here would silently drop every entry
                # which follows the currently read one.
                self.format_decoded(data.split(separator), result)
                continue

            data = self.extract_base(data)

            # We create an instance of the checker.
            checker = Check(data)

            if not data:
                # The extracted base is empty.
                continue

            if checker.is_domain() or checker.is_ip():
                # The extracted base is a valid domain or a valid IP.
                result.append(data)
                continue

            # Neither a valid domain nor a valid IP.
            # We try to get the url base.
            url_base = checker.is_url(return_base=True)

            if url_base:
                result.append(url_base)

        return result

    def __decode_v1(self, data):
        """
        Decodes the v1, i.e. the rules anchored with :code:`||`.

        :param str data: A string to decode.

        :rtype: list
        """

        result = []

        rematch = helpers.Regex(r"^(?:.*\|\|)([^\/\$\^]{1,}).*$").match(
            data, return_match=True, group=0, rematch=True
        )

        if not rematch:
            return result

        if self.options_separator not in data:
            # No option at all: we simply keep what we matched.
            result.extend(self.extract_base(rematch))
            return result

        options = data.split(self.options_separator)[-1].split(
            self.option_separator
        )

        # The options for which the matched subject is kept.
        kept_options = (
            "third-party",
            "script",
            "popup",
            "xmlhttprequest",
            "all",
            "document",
        )

        if not options[-1] or any(option in options for option in kept_options):
            result.extend(self.extract_base(rematch))

        extra = self.extract_from_options(options)

        if extra:
            if isinstance(extra, list):  # pragma: no cover
                extra.extend(self.extract_base(rematch))
                result.extend(self.extract_base(extra))
            else:
                result.extend(self.extract_base(rematch))

        return result

    def __decode_v2(self, data):
        """
        Decodes the v2, i.e. the rules enclosed between two :code:`|`.

        :param str data: A string to decode.

        :rtype: list
        """

        rematch = helpers.Regex(r"^\|(.*\..*)\|$").match(
            data, return_match=True, group=0, rematch=True
        )

        return self.format_decoded(rematch) if rematch else []

    def __decode_v3(self, data):
        """
        Decodes the v3, i.e. the quoted value of an attribute selector
        which follows one or more :code:`#`.

        :param str data: A string to decode.

        :rtype: list
        """

        rematch = helpers.Regex(
            r"(?:#+(?:[a-z]+?)?\[[a-z]+(?:\^|\*)\=(?:\'|\"))(.*\..*)(?:(?:\'|\")\])"
        ).match(data, return_match=True, group=0, rematch=True)

        return self.format_decoded(rematch) if rematch else []

    def __decode_v4(self, data):
        """
        Decodes the v4, i.e. what precedes :code:`##` or :code:`#@#`.

        :param str data: A string to decode.

        :rtype: list
        """

        rematch = helpers.Regex(r"^(.*?)(?:#{2}|#@#)").match(
            data, return_match=True, group=0, rematch=True
        )

        return self.format_decoded(rematch) if rematch else []

    def get_converted(self):
        """
        Converts and return the result of the conversion.

        :rtype: list
        """

        # Every decoder is applied, in this order, to every line we do not
        # have to ignore.
        decoders = (
            self.__decode_v1,
            self.__decode_v2,
            self.__decode_v3,
            self.__decode_v4,
        )

        result = []

        for data in self.remove_ignored(self.data_to_convert):
            for decoder in decoders:
                result.extend(decoder(data))

        return helpers.List(result).format()