#################################################################################
##
## webcheck 1.0 - Monitor URLs for changes
##
## Copyright (c) 2002 Steffen Siebert (siebert@SteffenSiebert.de)
##
#################################################################################
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##
#################################################################################
## Requirements                                                                ##
#################################################################################
##
## Python 2.2 or later:
## <http://www.python.org>
##
## timeoutsocket.py:
## <http://www.timo-tasi.org/python/>
##
## Patched smtplib.py for authentication:
## <http://www.informatik.fh-muenchen.de/~ifw00065/VitaminP/>
##
## Both Python scripts are also available from the webcheck homepage:
## <http://www.SteffenSiebert.de/soft/python/webcheck.html>
##
#################################################################################
## Usage                                                                       ##
#################################################################################
##
## webcheck.py takes a list of URLs, retrieves them and reports whether
## the page changed since the last visit or not.
##
## The URLs are stored in a plain textfile called webcheck.lst.
## Each line contains one URL and consists of 4 columns separated
## by the pipe symbol '|'.
##
## Example:
## http://www.steffensiebert.de/soft/python/webcheck.py|Webcheck Homepage|0|
##
## The first column contains the URL, the second the description, the
## third the CRC computed from the webpage (use 0 for new entries) and
## the fourth is either empty or contains a regular expression (using
## the python re module syntax) starting with a plus sign '+' if only
## the matching part of the webpage should be checked or with
## a minus sign '-' if all but the matching part should be checked.
##
## When webcheck runs it prints messages to stdout and writes an HTML page
## called webcheck.html to the current working directory with all URLs
## which changed since the last check.
##
## webcheck can also mail you a similar report via SMTP if you set
## the customizing variables mailserver, fromaddr and toaddrs.
##
## If you use the Microsoft Internet Explorer, you don't have to manually
## edit the webcheck.lst file. Just create a new folder in your Favorites
## (I named mine "webcheck") and set the customizing variable favoritePath
## to the path of this folder.
##
## webcheck synchronizes this folder and the webcheck.lst file. Just create
## a new Favorite or modify an existing one and the webcheck.lst will be
## updated and vice versa.
##
#################################################################################
## Support                                                                     ##
#################################################################################
##
## The latest version of webcheck is always available from my homepage:
## <http://www.SteffenSiebert.de/soft/python/webcheck.html>
##
## If you have bug reports, patches or some questions, just send a mail to
## <webcheck@SteffenSiebert.de>
##
#################################################################################

import os
import re
import urllib
import string
import sgmllib
import sys
import getopt
import zlib
import smtplib
import timeoutsocket
from sys import argv

class StripHTMLParser(sgmllib.SGMLParser):
    """
    StripHTMLParser Class
    Strips all html tags
    """
    
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""

    def init(self):
        self.result = ""
        
    def handle_data(self, data):
        if data:
            self.result = self.result + data

    def start_script(self, attrs):
        pass

    def end_script(self):
        pass

    def start_noscript(self, attrs):
        pass

    def end_noscript(self):
        pass

    def start_iframe(self, attrs):
        pass

    def end_iframe(self):
        pass

    def start_a(self, attrs):
        pass

    def end_a(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        pass

    def do_img(self, attrs):
        pass

    def unknown_starttag(self, tag, attrs):
        pass
    
    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, name):
        self.result = "%s&%s;" % (self.result, name)
        
def readFavorites():
    """
    reads all favorites and stores them in favoriteNames and favoriteUrls

    Every file '<name>.url' found in favoritePath is scanned for lines
    of the form 'URL=<url>'. For each such line favoriteNames[name] is
    set to the url and favoriteUrls[url] is set to the name.
    """
    for filename in os.listdir(favoritePath):
        match = re.search(r"^(.*)\.url$", filename)
        if not match:
            continue
        name = match.group(1)
        # os.path.join also works if favoritePath has no trailing separator
        shortcut = open(os.path.join(favoritePath, filename))
        try:
            for line in shortcut.readlines():
                match = re.search("^URL=(.*)$", line)
                if match:
                    url = match.group(1)
                    favoriteNames[name] = url
                    favoriteUrls[url] = name
        finally:
            # Do not keep one open handle per favorite around
            shortcut.close()

def createFavorite(name, url):
    """
    creates a new favorite
    """
    print "Creating favorite %s" % name
    file = open(favoritePath+name+".url", "w")
    file.write("[InternetShortcut]\nURL=%s\n" % url)
    file.close()

def check(name, url, crc, expr):
    """
    checks a url for changes
    """
    global mail
    global stripHtmlParser
    print "Check %s:" % (name)
    try:
        con = urlopener.open('%s' % (url))
        output = con.read()
    except IOError, e:
        print "IOError %s" % e
        out.write("%s|%s|%s|%s\n" % (url, name, crc, expr))
        return
    except timeoutsocket.Timeout:
        print "Timeout"
        out.write("%s|%s|%s|%s\n" % (url, name, crc, expr))
        return
    if debug:
        dout = open(debug + "/" + name + ".html", "w")
        dout.write(output)
        dout.close()

    if expr != '':
        if expr[0] == '+':
            print "Using only match %s" % expr[1:]
            match = re.search(expr[1:], output)
            if match:
                output = match.group(0)
            else:
                print "Regexp didn't match!"
        elif expr[0] == '-':
            print "Delete match %s" % expr[1:]
            output = re.sub(expr[1:], "", output)
        else:
            print "Filter %s doesn't start with '+' or '-'" % expr

        if debug:
            dout = open(debug + "/" + name + "-filtered.html", "w")
            dout.write(output)
            dout.close()

    try:
        stripHtmlParser.feed(output)
        stripHtmlParser.close()
        output = stripHtmlParser.result
    except sgmllib.SGMLParseError, message:
        print "HTML parse error:"
        print message
        print "Checking page with html tags!"
        stripHtmlParser = StripHTMLParser()
        pass
    
    output = re.sub('\s+', ' ', output)
    output = string.strip(output)
    if debug:
        dout = open(debug + "/" + name + ".txt", "w")
        dout.write(output)
        dout.close()

    newcrc = zlib.crc32(output)
    stripHtmlParser.init()
    out.write("%s|%s|%s|%s\n" % (url, name, newcrc, expr))
    if long(crc) == long(newcrc):
        print "No changes!"
    elif long(crc) == 0:
        print "New URL"
    else:
        print "CRC mismatch. Webpages has changed!"
        hout.write('%s: <a href="%s">%s</a><br>\n' % (name, url, url))
        mail = mail + '%s changed:\n<%s>\n\n' % (name, url)

###########################
## Customizing variables ##
###########################

# Set proxy to proxy url for using a http-proxy
proxy = None
#proxy = 'http://<proxy>:<port>/'

# Set favoritePath to the folder where webcheck should read and
# store internet explorer favorites
favoritePath = None
#favoritePath = "c:/dokumente und einstellungen/<loginname>/favoriten/webcheck/"

# Webcheck can notify you by mail using SMTP
# Define your SMTP server here
mailserver = None
#mailserver = "mail.gmx.net"

# If the SMTP server needs authentication,
# define mailuser and password here
mailuser = None
# password gets a default too, so that setting only mailuser does not
# end in a NameError when the report is sent
password = None
#mailuser = "<smtp server user>"
#password = "<smtp server password>"

# Set sender and recipient email address
fromaddr = "<email sender address>"
toaddrs  = "<email recipient address>"

# You can change the default timeout value of 20 seconds
timeoutsocket.setDefaultSocketTimeout(20)

# Debug - Save retrieved webpages to given directory (which must exist)
debug = None
#debug = "c:/temp/webdebug"

####################################################
## No user changes should be necessary below this ##
####################################################
# webcheck.lst is read, the new list is written to webcheck.tmp
inputFile = 'webcheck.lst'
outputFile = 'webcheck.tmp'
backupFile = 'webcheck.bak'
htmlFile = 'webcheck.html'

urlopener = urllib.FancyURLopener()
stripHtmlParser = StripHTMLParser()

# Favorites which are not processed yet: name -> url and url -> name
favoriteNames = {}
favoriteUrls = {}
# Text of the mail report, extended by check() for every changed page
mail = ''
    
# The option -i is accepted, but it has no effect at the moment
try:
    optlist, args = getopt.getopt(argv[1:],'i')
except getopt.error, msg:
    sys.stdout = sys.stderr
    print "Error: %s" % msg
    sys.exit(1)

if proxy != None:
    os.environ['http_proxy'] = proxy
    # An opener reads the proxy settings when it is created. The opener
    # created before http_proxy was set does not know the proxy, so it
    # is replaced by a new one.
    urlopener = urllib.FancyURLopener()

if favoritePath:
    readFavorites()

# A missing list file is not an error, there are just no entries yet
try:
    input = open(inputFile)
except IOError:
    input = None

out = open(outputFile, "w")
hout = open(htmlFile, "w")

hout.write("<html><head><title>Changed webpages</title></head><body><h1>Changed webpages</h1>\n")

if input:
    for line in input.readlines():
        match = re.search('^(#?)([^|]+)\|([^|]+)\|([^|]+)\|(.*)$',line)
        if match != None:
            comment = match.group(1)
            url = match.group(2)
            name = match.group(3)
            crc = match.group(4)
            expr = match.group(5)
            if crc == '':
                crc = '0'
    #        print "<%s> <%s> <%s>" % (url, name, crc)

            # Testen, ob in den Favoriten vorhanden
            try:
                furl = favoriteNames[name]
            except KeyError:
                furl = None

            if furl:
                if furl != url:
                    # URL hat sich geändert
                    print "%s: URL of favorite changed!" % name
                    print "New URL: %s" % furl
                    url = furl
                # Favorite ist bearbeitet und wird gelöscht 
                del favoriteNames[name]
                del favoriteUrls[furl]

            try:
                fname = favoriteUrls[url]
            except KeyError:
                fname = None

            if fname:
                if fname != name:
                    # Name hat sich geändert
                    print "%s: Name of favorite changed!" % name
                    print "New Name: %s" % fname
                    name = fname
                # Favorite ist bearbeitet und wird gelöscht 
                del favoriteNames[fname]
                del favoriteUrls[url]

            if favoritePath and furl == None and fname == None:
                createFavorite(name, url)

            if comment:
                out.write(line)
            else:
                check(name, url, crc, expr)

        else:
            commentline = re.search('^#.*$',line)
            if commentline != None:
                # Kommentarzeile
                out.write("%s\n" % commentline.group(0))
                continue
            else:
                print 'Illegal line in %s: >%s<' % (inputFile,line)
                out.write("%s" %line)

    input.close()

# Restliche Favoriten bearbeiten
for key in favoriteNames.keys():
    name = key
    url = favoriteNames[key]
    print "New favorite %s: <%s>" % (name, url)
    check(key, favoriteNames[key], 0, '')

out.close()
hout.write("</body></html>\n")
hout.close()

if mailserver != None and mail != '':
    print "Sending report via mail to <%s>" % toaddrs
    mail = 'To: %s\nSubject: Webcheck report\n%s\nThis mail was generated and sent by webcheck.py:\n<http://www.SteffenSiebert.de/soft/python/webcheck.html>\n' % (toaddrs, mail)
    server = smtplib.SMTP(mailserver)
    # server.set_debuglevel(1)
    if mailuser != None:
        server.login(mailuser, password)
    server.sendmail(fromaddr, toaddrs, mail)
    server.quit()

# Rotate the files: drop the old backup, keep the old list as backup
# and make the list written by this run the current one. Each step is
# tried on its own, because some file might not exist.
for operation, arguments in ((os.unlink, (backupFile,)),
                             (os.rename, (inputFile, backupFile)),
                             (os.rename, (outputFile, inputFile))):
    try:
        operation(*arguments)
    except OSError:
        pass

