#!/usr/bin/python3
"""
PlanetFilter - filter for blog aggregators.

PlanetFilter uses a blacklist to filter a blog aggregator feed.
It allows anyone to subscribe to popular blog aggregators without
being overwhelmed by the noise.

Copyright (C) 2010, 2015-2019  Francois Marier <francois@fmarier.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
import codecs
import configparser as cp
import gzip
import html
import http.client
import io
import os
import os.path
import sys
import urllib.error
from urllib.parse import quote, urlsplit, urlunsplit
from urllib.request import Request, urlopen
from xml.dom.minidom import Node
import xml.parsers.expat

import defusedxml.minidom as minidom

RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

VERSION = '0.8.2'


def delete_node(node):
    """Detach *node* from its parent, removing it from the DOM tree."""
    node.parentNode.removeChild(node)


def delete_rss1_item(item):
    """Remove an RSS 1.0 <item> together with the <rdf:li> entries citing it.

    The item's ``rdf:about`` value is compared with the ``rdf:resource``
    attribute of every <rdf:li> found in the first <rdf:Seq> of the first
    <channel> under the item's parent; matching <rdf:li> elements are
    removed before the item itself is removed.

    A feed lacking the <channel> or the <rdf:Seq> is tolerated: there is
    then no reference to clean up and only the item is deleted.
    """
    rdfabout = item.getAttributeNS(RDFNS, 'about')
    rdfnode = item.parentNode

    # Locate the sequence that lists the items; either lookup may come back
    # empty (None) on a malformed feed.
    channel = rdfnode.getElementsByTagName('channel').item(0)
    rdfseq = None
    if channel is not None:
        rdfseq = channel.getElementsByTagNameNS(RDFNS, 'Seq').item(0)

    if rdfseq is not None:
        # Delete every reference to the item
        # pylint: disable=invalid-name
        for li in rdfseq.getElementsByTagNameNS(RDFNS, 'li'):
            if li.getAttributeNS(RDFNS, 'resource') == rdfabout:
                delete_node(li)

    # Delete the item
    delete_node(item)


def is_rss2(xmldocument):
    """Tell whether *xmldocument* looks like an RSS 2.0 feed.

    The document qualifies when it contains exactly one <rss> element and
    that element's ``version`` attribute is exactly "2.0".
    """
    candidates = xmldocument.getElementsByTagName('rss')
    if candidates.length != 1:
        return False
    # Check the version
    return candidates.item(0).getAttribute('version') == '2.0'


def is_rss1(xmldocument):
    """Tell whether *xmldocument* looks like an RSS 1.0 (RDF) feed.

    The document qualifies when it contains exactly one <rdf:RDF> element
    whose ``xmlns`` attribute mentions the RSS 1.0 namespace.
    """
    roots = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF')
    if roots.length == 1:
        # Check the namespace/version
        default_namespace = roots.item(0).getAttribute('xmlns')
        return 'purl.org/rss/1.0' in default_namespace
    return False


def is_atom(xmldocument):
    """Tell whether *xmldocument* looks like an Atom feed.

    The document qualifies when it contains exactly one <feed> element
    whose ``xmlns`` attribute mentions the Atom 2005 namespace.
    """
    roots = xmldocument.getElementsByTagName('feed')
    if roots.length == 1:
        # Check the namespace/version
        default_namespace = roots.item(0).getAttribute('xmlns')
        return 'w3.org/2005/Atom' in default_namespace
    return False


def filter_rss2(xmldocument, blacklist):
    """Remove blacklisted <item> elements from an RSS 2.0 document, in place.

    An item is removed when the text of one of its <title> elements starts
    with a blacklisted author or contains a blacklisted title, or when the
    text of one of its <link> elements starts with a blacklisted URL.

    *blacklist* is a dict with the keys 'authors', 'titles' and 'urls', each
    holding a list of strings or None.

    Always returns True.  A document whose <rss> element has no <channel>
    contains nothing to filter and is left untouched.
    """
    # pylint: disable=too-many-branches
    rss = xmldocument.getElementsByTagName('rss').item(0)
    channel = rss.getElementsByTagName('channel').item(0)
    if channel is None:
        return True  # no channel, hence no items to filter

    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    bad_authors = blacklist['authors'] or []
    bad_titles = blacklist['titles'] or []
    bad_urls = blacklist['urls'] or []

    for item in channel.getElementsByTagName('item'):
        unwanted = False

        if bad_authors or bad_titles:
            for title_node in item.getElementsByTagName('title'):
                textnode = title_node.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue  # skip empty or non-textual titles
                titlestring = textnode.nodeValue.strip()
                # Authors are matched as a prefix of the title, titles as a
                # substring anywhere in it.
                if (any(titlestring.startswith(name) for name in bad_authors)
                        or any(word in titlestring for word in bad_titles)):
                    unwanted = True
                    break

        if not unwanted and bad_urls:
            for link_node in item.getElementsByTagName('link'):
                textnode = link_node.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue
                linkstring = textnode.nodeValue.strip()
                if any(linkstring.startswith(url) for url in bad_urls):
                    unwanted = True
                    break

        if unwanted:
            delete_node(item)

    return True


def filter_atom(xmldocument, blacklist):
    """Remove blacklisted <entry> elements from an Atom document, in place.

    An entry is removed when:
      * the text of an <author>/<name> starts with a blacklisted author, or
      * the text of a <title> contains a blacklisted title, or
      * the ``href`` of an "alternate" <link> starts with a blacklisted URL.

    *blacklist* is a dict with the keys 'authors', 'titles' and 'urls', each
    holding a list of strings or None.

    Always returns True.
    """
    # pylint: disable=too-many-branches
    feed = xmldocument.getElementsByTagName('feed').item(0)
    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    bad_authors = blacklist['authors'] or []
    bad_titles = blacklist['titles'] or []
    bad_urls = blacklist['urls'] or []

    for entry in feed.getElementsByTagName('entry'):
        unwanted = False

        if bad_authors:
            for author_node in entry.getElementsByTagName('author'):
                name = author_node.getElementsByTagName('name').item(0)
                if name is None:
                    continue  # malformed <author> without a <name>
                textnode = name.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue
                authorstring = textnode.nodeValue.strip()
                if any(authorstring.startswith(bad) for bad in bad_authors):
                    unwanted = True
                    break

        if not unwanted and bad_titles:
            for title_node in entry.getElementsByTagName('title'):
                textnode = title_node.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue  # skip empty or non-textual titles
                titlestring = textnode.nodeValue.strip()
                if any(word in titlestring for word in bad_titles):
                    unwanted = True
                    break

        if not unwanted and bad_urls:
            for link in entry.getElementsByTagName('link'):
                # RFC 4287 (4.2.7.2): a link without a "rel" attribute is to
                # be interpreted as rel="alternate".
                if (link.hasAttribute('rel')
                        and link.getAttribute('rel') != 'alternate'):
                    continue
                linkstring = link.getAttribute('href')
                if any(linkstring.startswith(url) for url in bad_urls):
                    unwanted = True
                    break

        if unwanted:
            delete_node(entry)

    return True


def filter_rss1(xmldocument, blacklist):
    """Remove blacklisted <item> elements from an RSS 1.0 document, in place.

    An item is removed (along with its <rdf:li> references) when the text of
    one of its <title> elements starts with a blacklisted author or contains
    a blacklisted title, or when the text of one of its <link> elements
    starts with a blacklisted URL.

    *blacklist* is a dict with the keys 'authors', 'titles' and 'urls', each
    holding a list of strings or None.

    Always returns True.
    """
    # pylint: disable=too-many-branches
    rdf = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF').item(0)
    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    bad_authors = blacklist['authors'] or []
    bad_titles = blacklist['titles'] or []
    bad_urls = blacklist['urls'] or []

    for item in rdf.getElementsByTagName('item'):
        unwanted = False

        if bad_authors or bad_titles:
            for title_node in item.getElementsByTagName('title'):
                textnode = title_node.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue  # skip empty or non-textual titles
                titlestring = textnode.nodeValue.strip()
                # Authors are matched as a prefix of the title, titles as a
                # substring anywhere in it.
                if (any(titlestring.startswith(name) for name in bad_authors)
                        or any(word in titlestring for word in bad_titles)):
                    unwanted = True
                    break

        if not unwanted and bad_urls:
            for link_node in item.getElementsByTagName('link'):
                textnode = link_node.firstChild
                if not textnode or textnode.nodeType not in text_types:
                    continue
                linkstring = textnode.nodeValue.strip()
                if any(linkstring.startswith(url) for url in bad_urls):
                    unwanted = True
                    break

        if unwanted:
            delete_rss1_item(item)

    return True


def filter_feed(xmldocument, blacklist):
    """Filter *xmldocument* with the filter matching its feed format.

    The formats are probed in the order RSS 2.0, RSS 1.0, Atom and the
    result of the first matching filter is returned.  When no format is
    recognised, an error is printed to stderr and False is returned.
    """
    handlers = (
        (is_rss2, filter_rss2),
        (is_rss1, filter_rss1),
        (is_atom, filter_atom),
    )
    for recognises, apply_filter in handlers:
        if recognises(xmldocument):
            return apply_filter(xmldocument, blacklist)

    print('Unsupported feed type', file=sys.stderr)
    return False


def read_config_url(config, configfile):
    """Return the feed URL from the [feed] section of *config*.

    *configfile* is only used in error messages.  The path component of the
    URL is percent-encoded; percent signs are left alone so that a URL which
    is already escaped (e.g. containing ``%20``) is not escaped twice.

    Returns the URL, or None (after printing an error to stderr) when the
    [feed] section or a non-empty ``url`` option is missing.
    """
    try:
        url = config.get('feed', 'url')
    except cp.NoSectionError:
        print("Error: '%s' doesn't contain a [feed] section" % configfile,
              file=sys.stderr)
        return None
    except cp.NoOptionError:
        url = None  # reported below, like an empty value
    if not url:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return None

    # URL-escape the path (bug 1485854); '%' is kept safe so existing
    # percent-escapes survive unchanged.
    parts = urlsplit(url)
    parts = parts._replace(path=quote(parts.path, safe='/%'))
    return urlunsplit(parts)


def read_config_blacklist(config, configfile):
    """Build the blacklist dict from the [blacklist] section of *config*.

    The result has the keys 'authors', 'titles' and 'urls'.  Each value is
    the list of non-empty lines of the matching option, or None when the
    option (or the whole section) is absent.  A missing section triggers a
    single warning on stderr mentioning *configfile*; missing options are
    silently accepted.
    """
    blacklist = {}
    for position, field in enumerate(('authors', 'titles', 'urls')):
        try:
            raw = config.get('blacklist', field)
        except cp.NoSectionError:
            if position == 0:
                # Warn only once about the missing section
                print("Warning: '%s' doesn't contain a [blacklist] section" %
                      configfile, file=sys.stderr)
            blacklist[field] = None
        except cp.NoOptionError:
            blacklist[field] = None  # let's not warn about a missing option
        else:
            # One entry per line, dropping empty elements
            blacklist[field] = [line for line in raw.split("\n") if line]
    return blacklist


def download_feed(url):
    """Fetch *url* and return its body as bytes, stripped of outer whitespace.

    A gzip-compressed response (``Content-Encoding: gzip``) is decompressed
    transparently.  The response is always closed before returning.

    Returns None, after printing a message to stderr, when the URL cannot be
    fetched, the body cannot be read or decompressed, or the body is empty.
    """
    # pylint: disable=too-many-return-statements,too-many-branches
    # pylint: disable=import-outside-toplevel
    import zlib  # only needed to recognise corrupt gzip payloads

    request = Request(url, headers={
        'Accept-encoding': 'gzip', 'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
    })
    # The handlers go from the most specific exception to the most generic
    # one (HTTPError < URLError < OSError) so each failure gets its own label.
    try:
        response = urlopen(request)
    except urllib.error.HTTPError as err:
        print("Error: '%s' cannot be fetched (HTTPError): %s" % (url, err),
              file=sys.stderr)
        return None
    except urllib.error.URLError as err:
        print("Error: '%s' cannot be fetched (URLError): %s" % (url, err),
              file=sys.stderr)
        return None
    except TimeoutError as err:
        print("Error: '%s' cannot be fetched (TimeoutError): %s" % (url, err),
              file=sys.stderr)
        return None
    except ConnectionResetError as err:
        print("Error: '%s' cannot be fetched (ConnectionResetError): %s"
              % (url, err), file=sys.stderr)
        return None
    except http.client.BadStatusLine as err:
        print("Error: '%s' cannot be fetched (BadStatusLine): %s" % (url, err),
              file=sys.stderr)
        return None
    except OSError as err:
        print("Error: '%s' cannot be fetched (OSError): %s"
              % (url, err), file=sys.stderr)
        return None

    contents = None
    try:
        if response.info().get('Content-Encoding') == 'gzip':
            try:
                buf = io.BytesIO(response.read())
            except http.client.IncompleteRead:
                print("Error: can't decompress response (IncompleteRead)",
                      file=sys.stderr)
                return None
            except ConnectionResetError as err:
                print("Error: can't decompress response "
                      "(ConnectionResetError): %s" % err, file=sys.stderr)
                return None
            try:
                contents = gzip.GzipFile(fileobj=buf).read()
            except (OSError, EOFError, zlib.error) as err:
                # Not gzip data after all, or a truncated/corrupt stream
                print("Error: can't decompress response (%s): %s"
                      % (type(err).__name__, err), file=sys.stderr)
                return None
        else:
            try:
                contents = response.read()
            except http.client.IncompleteRead as err:
                print("Warning: '%s' cannot be fully read: %s" % (url, err),
                      file=sys.stderr)
            except ConnectionResetError as err:
                print("Error: '%s' cannot be read (ConnectionResetError): %s"
                      % (url, err), file=sys.stderr)
                return None
    finally:
        response.close()  # release the connection on every exit path

    if not contents:
        print("Error: '%s' could not be downloaded" % url, file=sys.stderr)
        return None

    return contents.strip()


def remove_html_entities(contents):
    """Decode *contents* and replace HTML-only entities by their characters.

    *contents* is a bytes object, decoded as UTF-8 with a fallback to
    ISO-8859-1 (a warning is printed to stderr in that case).

    Named HTML entities such as ``&nbsp;`` or ``&eacute;`` are replaced by
    the character they stand for, and stray ampersands are escaped as
    ``&amp;``.  References that XML needs in order to stay well-formed are
    left exactly as written: the five predefined entities (``&amp;``,
    ``&lt;``, ``&gt;``, ``&quot;``, ``&apos;``) and numeric references to
    ``<``, ``>``, ``"`` and ``'``.

    Returns the resulting text as a str.
    """
    # pylint: disable=import-outside-toplevel
    import re  # not imported at module level

    try:
        ret = contents.decode('utf-8')
    except UnicodeDecodeError as err:
        print("Warning: not a valid UTF-8 document (%s), trying ISO-8859-1"
              % err, file=sys.stderr)
        ret = contents.decode('iso-8859-1')

    # Prevent some entities from being replaced: swap their leading '&' for
    # a marker that html.unescape() does not recognise.
    marker = 'MAGICTOKEN-ENTITY-MAGICTOKEN'
    protected = re.compile(
        r'&(?=(?:amp|lt|gt|quot|apos'
        r'|#0*(?:34|39|60|62)'
        r'|#[xX]0*(?:22|27|3[cC]|3[eE]));)')
    ret = protected.sub(marker, ret)

    # Built-in Python 3.4 function
    ret = html.unescape(ret)

    # Look for any unescaped ampersands
    ret = ret.replace('&', '&amp;')

    # Restore the required entities
    return ret.replace(marker, '&')


def parse_feed(contents, url):
    """Parse the downloaded feed *contents* into a DOM document.

    The raw bytes are parsed first.  If that fails, a second attempt is made
    on the text returned by remove_html_entities().  *url* is only used in
    the messages printed to stderr.

    Both parser syntax errors and defusedxml's refusal of forbidden
    constructs are reported as an invalid feed rather than propagated.

    Returns the parsed document, or None when both attempts fail.
    """
    # pylint: disable=import-outside-toplevel
    from defusedxml import DefusedXmlException  # base of defusedxml errors

    parse_errors = (xml.parsers.expat.ExpatError, DefusedXmlException)

    try:
        document = minidom.parseString(contents)
    except parse_errors as err:
        print("Warning: '%s' is not a valid feed (%s)" % (url, err),
              file=sys.stderr)
        document = None

    if document:
        return document  # early exit for valid feeds

    # Try fixing HTML entities
    noentities = remove_html_entities(contents)

    try:
        document = minidom.parseString(noentities)
    except parse_errors as err:
        print("Error: '%s' is not a valid feed, even with HTML entities "
              "removed (%s)" % (url, err), file=sys.stderr)
        document = None

    return document


def process_config(configfile, outfile, overwrite):
    """Read a config file, fetch its feed and filter it.

    configfile -- path of the configuration file to read (UTF-8).
    outfile    -- path the filtered feed is written to; when empty or None
                  the feed is printed on standard output instead.
    overwrite  -- allow replacing an existing *outfile*.

    Returns True on success, and also when the feed could not be downloaded
    (treated as non-fatal: any previous output is left in place).  Returns
    False when the output exists and may not be overwritten, when the config
    file cannot be read or parsed, when it has no feed URL, when the feed
    cannot be parsed (an existing output file is then emptied), or when the
    output cannot be written.
    """
    if outfile and os.path.isfile(outfile) and not overwrite:
        print("Error: '%s' already exists, use --force to overwrite" % outfile,
              file=sys.stderr)
        return False

    config = cp.ConfigParser()
    try:
        with codecs.open(configfile, 'r', 'utf-8') as configfh:
            config.read_file(configfh)
    except (OSError, UnicodeDecodeError, cp.Error) as err:
        # Unreadable, badly encoded or syntactically invalid config file
        print("Error: '%s' cannot be read: %s" % (configfile, err),
              file=sys.stderr)
        return False

    url = read_config_url(config, configfile)
    if not url:
        return False  # fatal error
    blacklist = read_config_blacklist(config, configfile)

    contents = download_feed(url)
    if not contents:
        # non-fatal error: leave any previously filtered feed in place
        return True

    document = parse_feed(contents, url)
    if not document:
        if outfile and os.path.isfile(outfile):
            try:
                with codecs.open(outfile, 'w', 'utf-8') as outfh:
                    outfh.write('')  # clear any previous feed
            except PermissionError:
                print("Error: not enough permissions to write to '%s'"
                      % outfile, file=sys.stderr)
        return False

    # NOTE(review): the result of filter_feed() is ignored, so a feed of an
    # unsupported type is written out unfiltered -- confirm this is intended.
    filter_feed(document, blacklist)

    if outfile:
        try:
            with codecs.open(outfile, 'w', 'utf-8') as outfh:
                outfh.write(document.toxml())
        except PermissionError:
            print("Error: not enough permissions to write to '%s'" % outfile,
                  file=sys.stderr)
            return False
    else:
        print(document.toxml())
    return True


def main():
    """Parse the command line and process the requested config file.

    Returns the result of process_config(), or False (after printing an
    error to stderr) when the config file does not exist.
    """
    cli = argparse.ArgumentParser(
        description='Blacklist-based filter for blog aggregators.')
    cli.add_argument('configfile', type=str,
                     help='the config file to parse')
    cli.add_argument('-o', '--output', metavar='file',
                     required=False, type=str,
                     help='the output filename (default: <STDOUT>)')
    cli.add_argument('-f', '--force', dest='force', action='store_true',
                     help='overwrite the destination file')
    cli.add_argument('-V', '--version', action='version',
                     version='planetfilter %s' % VERSION)
    options = cli.parse_args()

    configfile = options.configfile
    if os.path.isfile(configfile):
        return process_config(configfile, options.output, options.force)

    print("Error: '%s' not found" % configfile, file=sys.stderr)
    return False


# Run only when executed as a script, and exit through sys.exit(): the bare
# exit() helper is injected by the site module and is not always available.
# Exit status is 0 when main() succeeds, 1 otherwise.
if __name__ == '__main__':
    sys.exit(0 if main() else 1)
