Scan for Files and Parse HTML to Find Images
scan_ext('c:\\python36\\tools', 'py')
beer.py
eiffel.py
hanoi.py
life.py
markov.py
mcast.py
queens.py
redemo.py
rpython.py
rpythond.py
sortvisu.py
ss1.py
vector.py
makelocalealias.py
msgfmt.py
pygettext.py
unparse.py
ChipViewer.py
ColorDB.py
DetailsViewer.py
ListViewer.py
Main.py
pyColorChooser.py
PyncheWidget.py
StripViewer.py
Switchboard.py
TextViewer.py
TypeinViewer.py
__init__.py
2to3.py
abitype.py
analyze_dxp.py
byext.py
byteyears.py
checkpip.py
checkpyc.py
cleanfuture.py
combinerefs.py
copytime.py
crlf.py
db2pickle.py
diff.py
dutree.py
eptags.py
find-uname.py
finddiv.py
findlinksto.py
findnocoding.py
find_recursionlimit.py
fixcid.py
fixdiv.py
fixheader.py
fixnotice.py
fixps.py
generate_opcode_h.py
get-remote-certificate.py
google.py
gprof2html.py
h2py.py
highlight.py
ifdef.py
import_diagnostics.py
lfcr.py
linktree.py
lll.py
mailerdaemon.py
make_ctype.py
md5sum.py
mkreal.py
ndiff.py
nm2def.py
objgraph.py
parseentities.py
parse_html5_entities.py
patchcheck.py
pathfix.py
pdeps.py
pickle2db.py
pindent.py
ptags.py
pydoc3.py
pysource.py
pyvenv.py
reindent-rst.py
reindent.py
rgrep.py
run_tests.py
serve.py
suff.py
svneol.py
texi2html.py
treesync.py
untabify.py
which.py
win_add2path.py
print(report('c:\\python36\\tools'))
[‘c:\\python36\\tools\\demo\\beer.py’, ‘c:\\python36\\tools\\demo\\eiffel.py’,
‘c:\\python36\\tools\\demo\\hanoi.py’, ‘c:\\python36\\tools\\demo\\life.py’,
‘c:\\python36\\tools\\demo\\markov.py’, ‘c:\\python36\\tools\\demo\\mcast.py’,
‘c:\\python36\\tools\\demo\\queens.py’, ‘c:\\python36\\tools\\demo\\redemo.py’,
‘c:\\python36\\tools\\demo\\rpython.py’, ‘c:\\python36\\tools\\demo\\rpythond.py’,
‘c:\\python36\\tools\\demo\\sortvisu.py’, ‘c:\\python36\\tools\\demo\\ss1.py’,
‘c:\\python36\\tools\\demo\\vector.py’, ‘c:\\python36\\tools\\i18n\\makelocalealias.py’,
‘c:\\python36\\tools\\i18n\\msgfmt.py’, ‘c:\\python36\\tools\\i18n\\pygettext.py’,
‘c:\\python36\\tools\\parser\\unparse.py’, ‘c:\\python36\\tools\\pynche\\ChipViewer.py’,
‘c:\\python36\\tools\\pynche\\ColorDB.py’, ‘c:\\python36\\tools\\pynche\\DetailsViewer.py’,
‘c:\\python36\\tools\\pynche\\ListViewer.py’, ‘c:\\python36\\tools\\pynche\\Main.py’,
‘c:\\python36\\tools\\pynche\\PyncheWidget.py’, ‘c:\\python36\\tools\\pynche\\StripViewer.py’,
‘c:\\python36\\tools\\pynche\\Switchboard.py’, ‘c:\\python36\\tools\\pynche\\TextViewer.py’,
‘c:\\python36\\tools\\pynche\\TypeinViewer.py’, ‘c:\\python36\\tools\\pynche\\__init__.py’,
‘c:\\python36\\tools\\pynche\\html40colors.txt’, ‘c:\\python36\\tools\\pynche\\namedcolors.txt’,
‘c:\\python36\\tools\\pynche\\pyColorChooser.py’, ‘c:\\python36\\tools\\pynche\\pynche.pyw’,
‘c:\\python36\\tools\\pynche\\webcolors.txt’, ‘c:\\python36\\tools\\pynche\\websafe.txt’,
‘c:\\python36\\tools\\scripts\\2to3.py’, ‘c:\\python36\\tools\\scripts\\abitype.py’,
‘c:\\python36\\tools\\scripts\\analyze_dxp.py’, ‘c:\\python36\\tools\\scripts\\byext.py’,
‘c:\\python36\\tools\\scripts\\byteyears.py’, ‘c:\\python36\\tools\\scripts\\checkpip.py’,
‘c:\\python36\\tools\\scripts\\checkpyc.py’, ‘c:\\python36\\tools\\scripts\\cleanfuture.py’,
‘c:\\python36\\tools\\scripts\\combinerefs.py’, ‘c:\\python36\\tools\\scripts\\copytime.py’,
‘c:\\python36\\tools\\scripts\\crlf.py’, ‘c:\\python36\\tools\\scripts\\db2pickle.py’,
‘c:\\python36\\tools\\scripts\\diff.py’, ‘c:\\python36\\tools\\scripts\\dutree.py’,
‘c:\\python36\\tools\\scripts\\eptags.py’, ‘c:\\python36\\tools\\scripts\\find-uname.py’,
‘c:\\python36\\tools\\scripts\\find_recursionlimit.py’, ‘c:\\python36\\tools\\scripts\\finddiv.py’,
‘c:\\python36\\tools\\scripts\\findlinksto.py’, ‘c:\\python36\\tools\\scripts\\findnocoding.py’,
‘c:\\python36\\tools\\scripts\\fixcid.py’, ‘c:\\python36\\tools\\scripts\\fixdiv.py’,
‘c:\\python36\\tools\\scripts\\fixheader.py’, ‘c:\\python36\\tools\\scripts\\fixnotice.py’,
‘c:\\python36\\tools\\scripts\\fixps.py’, ‘c:\\python36\\tools\\scripts\\generate_opcode_h.py’,
‘c:\\python36\\tools\\scripts\\get-remote-certificate.py’, ‘c:\\python36\\tools\\scripts\\google.py’,
‘c:\\python36\\tools\\scripts\\gprof2html.py’, ‘c:\\python36\\tools\\scripts\\h2py.py’,
‘c:\\python36\\tools\\scripts\\highlight.py’, ‘c:\\python36\\tools\\scripts\\ifdef.py’,
‘c:\\python36\\tools\\scripts\\import_diagnostics.py’, ‘c:\\python36\\tools\\scripts\\lfcr.py’,
‘c:\\python36\\tools\\scripts\\linktree.py’, ‘c:\\python36\\tools\\scripts\\lll.py’,
‘c:\\python36\\tools\\scripts\\mailerdaemon.py’, ‘c:\\python36\\tools\\scripts\\make_ctype.py’,
‘c:\\python36\\tools\\scripts\\md5sum.py’, ‘c:\\python36\\tools\\scripts\\mkreal.py’,
‘c:\\python36\\tools\\scripts\\ndiff.py’, ‘c:\\python36\\tools\\scripts\\nm2def.py’,
‘c:\\python36\\tools\\scripts\\objgraph.py’, ‘c:\\python36\\tools\\scripts\\parse_html5_entities.py’,
‘c:\\python36\\tools\\scripts\\parseentities.py’, ‘c:\\python36\\tools\\scripts\\patchcheck.py’,
‘c:\\python36\\tools\\scripts\\pathfix.py’, ‘c:\\python36\\tools\\scripts\\pdeps.py’,
‘c:\\python36\\tools\\scripts\\pickle2db.py’, ‘c:\\python36\\tools\\scripts\\pindent.py’,
‘c:\\python36\\tools\\scripts\\ptags.py’, ‘c:\\python36\\tools\\scripts\\pydoc3.py’,
‘c:\\python36\\tools\\scripts\\pysource.py’, ‘c:\\python36\\tools\\scripts\\pyvenv.py’,
‘c:\\python36\\tools\\scripts\\reindent-rst.py’, ‘c:\\python36\\tools\\scripts\\reindent.py’,
‘c:\\python36\\tools\\scripts\\rgrep.py’, ‘c:\\python36\\tools\\scripts\\run_tests.py’,
‘c:\\python36\\tools\\scripts\\serve.py’, ‘c:\\python36\\tools\\scripts\\suff.py’,
‘c:\\python36\\tools\\scripts\\svneol.py’, ‘c:\\python36\\tools\\scripts\\texi2html.py’,
‘c:\\python36\\tools\\scripts\\treesync.py’, ‘c:\\python36\\tools\\scripts\\untabify.py’,
‘c:\\python36\\tools\\scripts\\which.py’, ‘c:\\python36\\tools\\scripts\\win_add2path.py’,
‘c:\\python36\\tools\\pynche\\X\\rgb.txt’, ‘c:\\python36\\tools\\pynche\\X\\xlicense.txt’]
data("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
Our first web-pageFirstsectionSubsection 1
This is very important.
Second sentence.
LoremIpsum
A Picture
Go to my web-page
Hello World
images("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
pic1a.jpg
Solution
########################################################
# Lab assignment 9: Tree recursion and HTML Parsers
#
# This lab includes some examples of recursion
# over folder structures, extending the scanning examples.
# The last two questions ask you to play with the
# HTML parser and extend its functionality a bit.
#
########################################################
#
# Tree recursion and HTML Parsers
#
# You will find recursion over folder structures.
#
########################################################
##########################################################################
#1 scan path and everything below it for all files ending in extension #
# ext, e.g. scan('E:\\', 'txt') would search for .txt files on the drive#
# E:\ (most likely a USB stick), and scan('E:\\', 'py') for .py files.  #
# Print the name of all files satisfying the criterion. # #
##########################################################################
from os import listdir
from os.path import isdir, isfile, join
def scan_ext(path, ext):
    """Print the name of every file in or below *path* whose name ends
    in the extension *ext* (given without the leading dot, e.g. 'py').

    Matching files are printed one per line, by bare file name, in the
    order the directory walk encounters them.
    """
    suffix = '.' + ext  # require a real extension match, not just a name suffix
    for name in listdir(path):
        full = join(path, name)
        if isfile(full):
            if name.endswith(suffix):
                print(name)
        elif isdir(full):
            # recurse into subdirectories (skips entries that are
            # neither regular files nor directories, e.g. broken links)
            scan_ext(full, ext)
#Hint: base case as before: path is a file
#Hint 2: for checking extension, endswith may come in handy
#test with a USB stick, or a subfolder of C: which is relatively small
###########################################################################
#2 scan path and everything below it and return a list of all files you #
# found; the list should contain the full paths for every file you found #
###########################################################################
def report(path):
    """Return a list of the full paths of every file in or below *path*.

    Base case: *path* is itself a file, so the answer is just [path].
    Otherwise accumulate the results of recursing into each entry of
    the directory.
    """
    if isfile(path):
        return [path]
    found = []
    if isdir(path):
        for name in listdir(path):
            found.extend(report(join(path, name)))
    return found
#Hint: accumulate
#Hint 2: urljoin
##########################################################################
#3 HTMLParser to extract page content #
# you want to write a parser and a function that allows you to extract #
# all the text (data) on a web-page at a given url, and simply prints it#
##########################################################################
from html.parser import HTMLParser
from urllib.request import urlopen


class DataParser(HTMLParser):
    """Content parser: prints every piece of text (data) found in the
    HTML it is fed.  Only handle_data needs real work; tags themselves
    are ignored.
    """

    def handle_starttag(self, tag, attrs):
        # tags carry no page text; nothing to do
        pass

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        # end='' keeps the output compact (no newline per fragment)
        print(data, end='')
def data(url):
    """Print all the text content of the web page at *url*.

    Downloads the page and feeds it to a DataParser, which prints each
    text fragment as it is encountered.
    """
    parser = DataParser()
    with urlopen(url) as page:
        # NOTE(review): assumes a UTF-8-compatible page encoding;
        # errors='replace' keeps odd bytes from crashing the parse
        parser.feed(page.read().decode('utf-8', errors='replace'))
#Hint: you need to provide one of the three methods only. Which?
#Hint 2: use print(..., end='') to make the output more compact
#
#you can test with http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html
#http://www.yahoo.com or other web-pages
##########################################################################
#4 HTMLParser for images #
# every image tag has a src attribute: #
# <img src = 'pic1a.jpg' height = '100'>                                 #
# write a function and a parser that print #
# all the sources of images on the web-page at url; for the above #
# image tag, it should print pic1a.jpg #
##########################################################################
from html.parser import HTMLParser
from urllib.request import urlopen


class ImageParser(HTMLParser):
    """Image parser: prints the src attribute of every <img> tag fed to
    it.  Attributes arrive in handle_starttag as (name, value) pairs.
    """

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            # drill down to the src attribute among the reported pairs
            for name, value in attrs:
                if name == 'src':
                    print(value)

    def handle_endtag(self, tag):
        # images are void elements; end tags carry no information
        pass

    def handle_data(self, data):
        # page text is irrelevant for image extraction
        pass
def images(url):
    """Print the src of every image on the web page at *url*.

    Downloads the page and feeds it to an ImageParser, which prints the
    src attribute of each <img> tag it sees.
    """
    parser = ImageParser()
    with urlopen(url) as page:
        # NOTE(review): assumes a UTF-8-compatible page encoding;
        # errors='replace' keeps odd bytes from crashing the parse
        parser.feed(page.read().decode('utf-8', errors='replace'))
#Hint: src is an attribute, attributes get reported in attrs; in a first step
# printattrs for all image tags; then add code to drill down
# to the srcattribute
#