#! /usr/bin/env python
# -*- coding: utf-8 -*-
#======================================================
# leeurl.py - reads the URLs contained in a web page
#             using the BeautifulSoup module
#------------------------------------------------------
# FJA - fja@neocipres.com  June 2016
#------------------------------------------------------
# Asks for a URL, downloads the page, prints its title,
# the number of <div> tags and every absolute http(s)
# link, and writes the links to an HTML report
# (/tmp/out.html).  Runs on Python 2.7 and Python 3.
#======================================================
from __future__ import print_function

import sys
import re
import urllib
from contextlib import closing
from xml.sax.saxutils import escape, quoteattr

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Reported by main(); keeps the pure helpers below importable even
    # when the third-party dependency is not installed.
    BeautifulSoup = None

try:  # Python 2: raw_input() returns the typed text unevaluated
    leer = raw_input
except NameError:  # Python 3: input() already behaves that way
    leer = input

# Path of the generated HTML report.
SALIDA = "/tmp/out.html"

# URL schemes accepted as-is; anything else gets 'http://' prepended.
ESQUEMAS = ('http://', 'https://')


def raya(n, ca="="):
    """Return a rule made of the string *ca* repeated *n* times."""
    return ca * n


def contdiv(soup):
    """Return the number of <div> tags found in *soup*, as a string."""
    return str(len(soup.find_all("div")))


def normaliza_url(url):
    """Return *url* with surrounding blanks removed and a scheme ensured.

    URLs that already start with http:// or https:// (any letter case)
    are returned unchanged; otherwise 'http://' is prepended.
    """
    url = url.strip()
    if url.lower().startswith(ESQUEMAS):
        return url
    return 'http://' + url


def linea_enlace(i, url):
    """Return the numbered text line shown for link number *i*."""
    return '\t ' + str(i) + ': ' + url


def genera_pagina(titulo, urls):
    """Build the HTML report for a page title and its list of link URLs.

    The title and link texts are HTML-escaped and every href value is
    quoted and escaped, so characters such as '&', '<', '>' or spaces in
    the scraped data cannot break (or inject markup into) the report.
    """
    partes = [
        '<html lang="es-es">\n',
        '<head>',
        '<meta http-equiv="content-type" content="text/html; charset=utf-8">',
        '</head>\n',
        '<body>',
        '<hr><h2 align="center">', escape(titulo), '</h2><hr>',
    ]
    for i, url in enumerate(urls, 1):
        partes.append('<p><a target="_blank" href=%s>%s</a></p>'
                      % (quoteattr(url), escape(linea_enlace(i, url))))
    partes.append('</body>\n</html>\n')
    return ''.join(partes)


def main():
    """Run the interactive script; return 0 on success and 1 on failure."""
    if BeautifulSoup is None:
        print('Error: falta el módulo bs4 (BeautifulSoup)')
        return 1

    url = normaliza_url(leer('Introduzca URL: '))

    try:
        # closing() releases the connection on both Python 2 and 3.
        with closing(urlopen(url)) as respuesta:
            html = respuesta.read()
        # Name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed.
        sopa = BeautifulSoup(html, 'html.parser')
    except IOError as e:
        print('\n\t')
        print('Error: ', e, sys.exc_info()[0], '\n')
        print('Ejemplo de URL: http://www.gnu.org \n')
        return 1

    # A page may have no <title>, or an empty one; fall back to ''.
    etiqueta = sopa.title
    titulo = etiqueta.string if etiqueta is not None else None
    if titulo is None:
        titulo = u''

    # Only absolute links are listed (relative hrefs are skipped).
    enlaces = sopa.find_all('a', attrs={'href': re.compile(r'^https?://')})
    urls = [enlace.get('href') for enlace in enlaces]

    print(raya(80))
    print('\t Título de la página: ', titulo)
    print(raya(80, '-'))
    print('\t Etiquetas div: ', contdiv(sopa))
    print(raya(80, '-'))
    for i, destino in enumerate(urls, 1):
        print(linea_enlace(i, destino))
    print(raya(80))

    pagina = genera_pagina(titulo, urls)
    try:
        # Binary mode + explicit encoding behaves the same on 2 and 3.
        with open(SALIDA, "wb") as f:
            f.write(pagina.encode('utf-8'))
    except (IOError, OSError) as e:
        print('\n\t')
        print('Error: ', e, sys.exc_info()[0], '\n')
        return 1
    except Exception:
        # Last-resort report at the script boundary (e.g. encoding
        # errors); KeyboardInterrupt/SystemExit are not swallowed.
        print('\n\t')
        print('\t Error: ', sys.exc_info()[0], '\n')
        return 1

    print("\t \n Generado el archivo " + SALIDA + " ... \n")
    return 0


if __name__ == '__main__':
    sys.exit(main())