# Source code for PyFunceble.converter.adblock_input_line2subject
# pylint:disable=line-too-long
"""
The tool to check the availability or syntax of domain, IP or URL.
::
██████╗ ██╗ ██╗███████╗██╗ ██╗███╗ ██╗ ██████╗███████╗██████╗ ██╗ ███████╗
██╔══██╗╚██╗ ██╔╝██╔════╝██║ ██║████╗ ██║██╔════╝██╔════╝██╔══██╗██║ ██╔════╝
██████╔╝ ╚████╔╝ █████╗ ██║ ██║██╔██╗ ██║██║ █████╗ ██████╔╝██║ █████╗
██╔═══╝ ╚██╔╝ ██╔══╝ ██║ ██║██║╚██╗██║██║ ██╔══╝ ██╔══██╗██║ ██╔══╝
██║ ██║ ██║ ╚██████╔╝██║ ╚████║╚██████╗███████╗██████╔╝███████╗███████╗
╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝ ╚═════╝╚══════╝╚═════╝ ╚══════╝╚══════╝
Provides the conversion of an AdBlock input line into testable subjects.
Author:
Nissar Chababy, @funilrys, contactTATAfunilrysTODTODcom
Special thanks:
https://pyfunceble.github.io/#/special-thanks
Contributors:
https://pyfunceble.github.io/#/contributors
Project link:
https://github.com/funilrys/PyFunceble
Project documentation:
https://pyfunceble.readthedocs.io/en/dev/
Project homepage:
https://pyfunceble.github.io/
License:
::
Copyright 2017, 2018, 2019, 2020 Nissar Chababy
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import Any, List, Optional, Union
from PyFunceble.checker.syntax.domain import DomainSyntaxChecker
from PyFunceble.checker.syntax.ip import IPSyntaxChecker
from PyFunceble.converter.base import ConverterBase
from PyFunceble.converter.url2netloc import Url2Netloc
from PyFunceble.helpers.list import ListHelper
from PyFunceble.helpers.regex import RegexHelper
class AdblockInputLine2Subject(ConverterBase):
    """
    Converts/Extracts the subjects to test from an inputted AdBlock line.

    In order to decode the given line, this class and its conversion method
    go through a bunch of decoding methods (see :code:`get_converted`).
    """

    # Separates one option from the next inside the options section.
    OPTION_SEPARATOR: str = ","
    # Separates the filter pattern from its options section.
    OPTIONS_SEPARATOR: str = "$"

    _aggressive: bool = False

    def __init__(
        self, data_to_convert: Optional[Any] = None, aggressive: bool = False
    ) -> None:
        """
        :param data_to_convert:
            The AdBlock line to convert. It is forwarded as-is to the parent
            constructor.
        :param aggressive:
            The state of the aggressive decoding. :py:class:`None` leaves the
            class default untouched.

        :raise TypeError:
            When :code:`aggressive` is neither :py:class:`None` nor a
            :py:class:`bool`.
        """

        if aggressive is not None:
            self.aggressive = aggressive

        super().__init__(data_to_convert)

    @ConverterBase.data_to_convert.setter
    def data_to_convert(self, value: Any) -> None:
        """
        Overwrites the default behavior.

        :raise TypeError:
            When the given data to convert is not :py:class:`str`
        """

        if not isinstance(value, str):
            raise TypeError(f"<value> should be {str}, {type(value)} given.")

        # Delegate the actual storage to the setter of the parent property.
        # pylint: disable=no-member
        super(AdblockInputLine2Subject, self.__class__).data_to_convert.fset(
            self, value
        )

    @property
    def aggressive(self) -> bool:
        """
        Provides the state of the :code:`_aggressive` attribute.
        """

        return self._aggressive

    @aggressive.setter
    def aggressive(self, value: bool) -> None:
        """
        Provides a way to activate/deactivate the aggressive decoding.

        :raise TypeError:
            When the given value is not a :py:class:`bool`.
        """

        if not isinstance(value, bool):
            raise TypeError(f"<value> should be {bool}, {type(value)} given.")

        self._aggressive = value

    def set_aggressive(self, value: bool) -> "AdblockInputLine2Subject":
        """
        Provides a way to activate/deactivate the aggressive decoding.

        :param value:
            The new state of the aggressive decoding.

        :return:
            The current instance, which allows the chaining of calls.

        :raise TypeError:
            When the given value is not a :py:class:`bool`.
        """

        self.aggressive = value

        return self

    @staticmethod
    def should_be_ignored(line: str) -> bool:
        """
        Checks if we should ignore the given line.

        A line is ignored when, once stripped, it starts with one of:
        :code:`!`, :code:`@@`, :code:`/`, :code:`[`, :code:`.`, :code:`-`,
        :code:`_`, :code:`?` or :code:`&`.

        :param line:
            The line to check.
        """

        to_ignore = r"(^!|^@@|^\/|^\[|^\.|^-|^_|^\?|^&)"

        return RegexHelper(to_ignore).match(line.strip(), return_match=False)

    @classmethod
    def extract_base(cls, subject: Union[str, list]) -> Union[str, List[str]]:
        """
        Extracts the base of the given element (supposed URL).

        As example:

            Giving :code:`"hello.world/?is=beautiful"` returns :code:`"hello.world"`

        :param subject:
            The subject to work with. When a list is given, each of its
            members is processed and a list is returned.

        :return:
            The extracted base(s). A subject for which the conversion raises
            a :py:class:`ValueError` is returned untouched.
        """

        if isinstance(subject, list):
            return [cls.extract_base(x) for x in subject]

        try:
            return Url2Netloc(subject).get_converted()
        except ValueError:
            return subject

    @classmethod
    def __format_decoded(
        cls, decoded: List[str], *, result: Optional[List[str]] = None
    ) -> List[str]:
        """
        A recursive method which filters and formats the decoded data
        in order to delete unneeded parts.

        :param decoded:
            The decoded parts.
        :param result:
            The list to append the formatted subjects to. It is shared
            between the recursive calls; a new list is created when
            :py:class:`None` is given.

        :return:
            The list of subjects which are syntactically valid domains or IPs.
        """

        if result is None:
            result = []

        chars_to_split = ["^", "#", ",", "!", "|"]

        for data in decoded:
            if not data:
                continue

            splitter = next(
                (char for char in chars_to_split if char in data), None
            )

            if splitter is not None:
                # The findings of the recursion land in the shared result
                # list. We must continue with the next entry instead of
                # returning, otherwise the remaining entries are dropped.
                cls.__format_decoded(data.split(splitter), result=result)
                continue

            data = cls.extract_base(data)

            if data and (
                DomainSyntaxChecker(data).is_valid() or IPSyntaxChecker(data).is_valid()
            ):
                result.append(data)

        return result

    def __filter_options(self, options: List[str]) -> Union[bool, List[str]]:
        """
        Filters the interesting parts of the given list of options.

        :param options:
            The extracted options to filter.

        :return:
            In aggressive mode, the list of domains declared through the
            :code:`domain=` option (the ones starting with :code:`~` are
            excluded). Otherwise, whether such a domain was found.

        .. warning::
            This method only provides the domains if the aggressive mode is
            activated.
        """

        result = []
        regex_domain_in_option = r"domain=(.*)"

        for option in options:
            try:
                domains = RegexHelper(regex_domain_in_option).match(
                    option, return_match=True, rematch=True, group=0
                )[-1]
            except TypeError:
                # The helper did not provide something we can index:
                # this option does not declare any domain.
                continue

            result.extend(
                [x for x in domains.split("|") if x and not x.startswith("~")]
            )

        if self.aggressive:
            return result
        return bool(result)

    def __decode_v1(self, line: str) -> List[str]:
        """
        Our first decoding version.

        The main idea is to filter based on option and a pattern common to all
        AdBlock / Ublock format.

        :param line:
            The line to decode.
        """

        result = []

        # Get all groups :-)
        rematch = RegexHelper(r"^(?:.*\|\|)([^\/\$\^]{1,}).*$").match(
            line, return_match=True, group=0, rematch=True
        )

        if not rematch:
            return result

        if self.OPTIONS_SEPARATOR not in line:
            # No option at all: what we matched is what we want.
            result.extend(self.extract_base(rematch))
            return result

        # We get the list of options for filtering.
        options = line.split(self.OPTIONS_SEPARATOR)[-1].split(
            self.OPTION_SEPARATOR
        )

        # The options for which the matched subject itself is kept.
        accepted_options = (
            "third-party",
            "script",
            "popup",
            "xmlhttprequest",
            "all",
            "document",
        )

        if not options[-1] or any(x in options for x in accepted_options):
            result.extend(self.extract_base(rematch))

        extra = self.__filter_options(options)

        if extra:
            if isinstance(extra, list):
                # Aggressive mode: the domains found in the options are
                # provided along with the matched subject.
                extra.extend(self.extract_base(rematch))
                result.extend(self.extract_base(extra))
            else:
                result.extend(self.extract_base(rematch))

        return result

    def __decode_v2(self, line: str) -> List[str]:
        """
        Our second decoding version.

        The main idea here is that we will match simple records.

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(r"^\|(.*\..*)\|$").match(
            line, return_match=True, rematch=True, group=0
        )

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def __decode_v3(self, line: str) -> List[str]:
        """
        Our third decoding version.

        This one is for more complex formats (and Ublock).

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(
            r"(?:#+(?:[a-z]+?)?\[[a-z]+(?:\^|\*)\=(?:\'|\"))(.*\..*)(?:(?:\'|\")\])"
        ).match(line, return_match=True, rematch=True, group=0)

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def __decode_v4(self, line: str) -> List[str]:
        """
        Our fourth decoding version.

        This one is for the subjects which are followed by :code:`##` or
        :code:`#@#`.

        :param line:
            The line to decode.
        """

        result = []

        rematch = RegexHelper(r"^(.*?)(?:#{2}|#@#)").match(
            line, return_match=True, rematch=True, group=0
        )

        if rematch:
            result.extend(self.__format_decoded(rematch))

        return result

    def get_converted(self) -> List[str]:
        """
        Provides the subjects to test.

        :return:
            The subjects found by all decoding versions, without duplicates
            and sorted. The list is empty when the line should be ignored.
        """

        result = []

        if not self.should_be_ignored(self.data_to_convert.strip()):
            decoders = (
                self.__decode_v1,
                self.__decode_v2,
                self.__decode_v3,
                self.__decode_v4,
            )

            for decoder in decoders:
                result.extend(decoder(self.data_to_convert))

        return ListHelper(result).remove_duplicates().sort().subject