This file is indexed.

/usr/share/pyshared/weboob/tools/capabilities/messages/genericArticle.py is in python-weboob-core 0.g-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-

# Copyright(C) 2011  Julien Hebert
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.tools.browser import BasePage
from weboob.tools.browser import BrokenPageError
from lxml.etree import Comment


def try_remove(parser, base_element, selector):
    """Best-effort removal of one selected element from ``base_element``.

    The element is looked up with ``parser.select(base_element, selector, 1)``
    and handed to ``base_element.remove``.  A ``BrokenPageError`` or a
    ``ValueError`` raised by either step is swallowed on purpose, so the call
    is a no-op when the element cannot be selected or removed.
    """
    try:
        target = parser.select(base_element, selector, 1)
        base_element.remove(target)
    except (BrokenPageError, ValueError):
        # Deliberately ignored: nothing to remove is not an error here.
        pass


def try_drop_tree(parser, base_element, selector):
    """Call ``drop_tree()`` on each element selected under ``base_element``.

    The elements are whatever ``parser.select(base_element, selector)``
    yields; nothing happens when that result is empty.
    """
    matches = parser.select(base_element, selector)
    for match in matches:
        match.drop_tree()


def remove_from_selector_list(parser, base_element, selector_list):
    """Remove from ``base_element`` one selected element per selector.

    For each entry of ``selector_list`` the element returned by
    ``parser.select(base_element, selector, 1)`` is passed to
    ``base_element.remove``.  Errors are not caught here: any exception
    raised by the selection or the removal propagates to the caller.
    """
    for css in selector_list:
        unwanted = parser.select(base_element, css, 1)
        base_element.remove(unwanted)


def try_remove_from_selector_list(parser, base_element, selector_list):
    """Apply ``try_remove`` to ``base_element`` once per selector.

    Each selector of ``selector_list`` is handled independently through
    ``try_remove``, in list order.
    """
    for css in selector_list:
        try_remove(parser, base_element, css)


def drop_comments(base_element):
    """Drop every comment node found while iterating ``base_element``.

    ``drop_tree()`` is called on each node that ``base_element.iter(Comment)``
    yields.
    """
    # ``iter`` is the supported replacement for the deprecated
    # ``getiterator``.  The matches are collected into a list first so the
    # tree is never modified while it is still being walked.
    for comment in list(base_element.iter(Comment)):
        comment.drop_tree()

# Replace relative URLs in links and images with absolute URLs.
# Arguments: the HTML element to clean, and the domain name (including the http:// prefix).


def clean_relativ_urls(base_element, domain):
    """Prefix relative link and image URLs under ``base_element`` with ``domain``.

    Every ``<a href>`` and ``<img src>`` found with ``findall('.//tag')`` is
    rewritten in place to ``domain + value`` unless the value already starts
    with ``http://`` or ``https://``.

    Arguments:
        base_element: element whose descendant ``a`` and ``img`` elements are
            updated in place.
        domain: string prepended to relative values (scheme included, e.g.
            ``http://example.org``).

    Missing or empty ``href``/``src`` attributes are left untouched.  Values
    using any other scheme (``mailto:``, ``//host/...``, ...) are still
    treated as relative, as before.
    """
    # The previous check compared a 7-character slice against the 8-character
    # "https://" literal, which never matched, so https URLs were wrongly
    # prefixed.  ``startswith`` with a tuple tests both schemes correctly.
    absolute_prefixes = ("http://", "https://")
    for tag, attr in (('a', 'href'), ('img', 'src')):
        for element in base_element.findall('.//' + tag):
            url = element.attrib.get(attr)
            # ``get`` avoids a KeyError on e.g. <img> without src; empty
            # values are skipped so they are not turned into the bare domain.
            if url and not url.startswith(absolute_prefixes):
                element.attrib[attr] = domain + url


class NoAuthorElement(BrokenPageError):
    """Raised by ``GenericNewsPage.get_element_author`` when selecting the
    author element fails with ``BrokenPageError``."""


class NoBodyElement(BrokenPageError):
    """Raised by ``GenericNewsPage.get_element_body`` when selecting the
    body element fails with ``BrokenPageError``."""


class NoTitleException(BrokenPageError):
    """Raised by ``GenericNewsPage.get_title`` when the title cannot be
    selected even with the ``h1`` selector."""


class NoneMainDiv(AttributeError):
    """Raised by ``GenericNewsPage`` element getters when an
    ``AttributeError`` occurs while ``main_div`` is ``None``."""


class Article(object):
    """Simple container for the data of one article.

    ``author`` and ``title`` are class-level defaults (empty unicode strings);
    the remaining fields are set per instance by ``__init__``.
    """
    # Class-level defaults, shared until an instance overrides them.
    author = title = u''

    def __init__(self, browser, _id):
        """Store ``browser`` and ``_id`` (as ``id``) and blank the other fields."""
        self.browser = browser
        self.id = _id
        # No content is known yet: empty body and url, no date.
        self.body = self.url = u''
        self.date = None


class GenericNewsPage(BasePage):
    """Page helper that builds an ``Article`` from selectors applied to ``main_div``.

    ``main_div`` and the ``element_*_selector`` attributes default to the
    ``NotImplementedError`` class as a placeholder; presumably concrete pages
    are meant to replace them before ``get_article`` is called.
    """
    # Placeholder that no method of this class reads.
    __element_body = NotImplementedError
    # Source of the default author/title used when extraction is impossible.
    __article = Article
    element_title_selector = NotImplementedError
    main_div = NotImplementedError
    element_body_selector = NotImplementedError
    element_author_selector = NotImplementedError

    def get_body(self):
        """Return the body element serialized with ``self.parser.tostring``.

        Exceptions raised by ``get_element_body`` are not caught here.
        """
        return self.parser.tostring(self.get_element_body())

    def get_author(self):
        """Return the stripped text of the author element.

        Falls back to the default ``Article.author`` when the author element
        cannot be selected or when ``main_div`` is ``None``.
        """
        try:
            return self.get_element_author().text_content().strip()
        except (NoAuthorElement, NoneMainDiv):
            # TODO: emit a warning
            return self.__article.author

    def get_title(self):
        """Return the stripped text of the title element.

        ``element_title_selector`` is tried first, then ``h1`` as a fallback.
        The fallback no longer overwrites ``self.element_title_selector``, so
        reading the title has no side effect on the page object.

        Returns the default ``Article.title`` when an ``AttributeError``
        occurs while ``main_div`` is ``None``.

        Raises:
            NoTitleException: when no candidate selector yields a title.
            AttributeError: re-raised when ``main_div`` is not ``None``.
        """
        candidates = [self.element_title_selector]
        if self.element_title_selector != 'h1':
            candidates.append('h1')

        for selector in candidates:
            try:
                return self.parser.select(
                    self.main_div,
                    selector,
                    1).text_content().strip()
            except AttributeError:
                if self.main_div is None:
                    # TODO: emit a warning
                    return self.__article.title
                raise
            except BrokenPageError:
                # Selection failed: try the next candidate, if any.
                continue
        raise NoTitleException("no title on %s" % (self.browser,))

    def get_element_body(self):
        """Return the element selected by ``element_body_selector`` in ``main_div``.

        Raises:
            NoBodyElement: when the selection raises ``BrokenPageError``.
            NoneMainDiv: when an ``AttributeError`` occurs while ``main_div``
                is ``None``; any other ``AttributeError`` is re-raised.
        """
        try:
            return self.parser.select(self.main_div, self.element_body_selector, 1)
        except BrokenPageError:
            raise NoBodyElement("no body on %s" % (self.browser,))
        except AttributeError:
            if self.main_div is None:
                raise NoneMainDiv("main_div is none on %s" % (self.browser,))
            else:
                raise

    def get_element_author(self):
        """Return the element selected by ``element_author_selector`` in ``main_div``.

        Raises:
            NoAuthorElement: when the selection raises ``BrokenPageError``.
            NoneMainDiv: when an ``AttributeError`` occurs while ``main_div``
                is ``None``; any other ``AttributeError`` is re-raised.
        """
        try:
            return self.parser.select(self.main_div, self.element_author_selector, 1)
        except BrokenPageError:
            # Carry the same kind of context message as the sibling errors.
            raise NoAuthorElement("no author on %s" % (self.browser,))
        except AttributeError:
            if self.main_div is None:
                raise NoneMainDiv("main_div is none on %s" % (self.browser,))
            else:
                raise

    def get_article(self, _id):
        """Build and return an ``Article`` with id ``_id`` for this page.

        Author, title and body come from the corresponding getters; the url
        is copied from ``self.url``.
        """
        article = Article(self.browser, _id)
        article.author = self.get_author()
        article.title = self.get_title()
        article.url = self.url
        article.body = self.get_body()

        return article