#!/usr/bin/env python
"""Inline the images of an HTML document as base64 ``data:`` URIs.

Usage::

    script INPUT OUTPUT

INPUT is a local path or a URL and OUTPUT is the file to write.  The
document is streamed through a three-state scanner; every ``<img ...>`` tag
whose ``src`` attribute is quoted gets that attribute replaced by a
``data:`` URI holding the image bytes.  When INPUT is an ``http:`` or
``https:`` URL, image references are resolved against it and fetched over
the network; otherwise they are opened as ``"./" + src``, i.e. relative to
the current working directory.

The code runs on both Python 2 and Python 3 and handles the document as
bytes throughout, so its encoding is never altered.
"""

import base64
import contextlib
import mimetypes
import re
import sys

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin, urlparse

# Scanner states.
STATE_NOMATCH = 0     # in text, looking for the next "<"
STATE_SAWBRACKET = 1  # just consumed "<"; deciding whether it opens an <img>
STATE_INTAG = 2       # inside a tag, looking for the closing ">"

CHUNK_SIZE = 4096          # bytes requested from the input per read
MAX_TAG_SIZE = 64 * 1024   # stop waiting for ">" once this much is buffered

# Not read anywhere in this script; kept so the module namespace is unchanged.
operation = "archive"

# Matches right after a "<".  Escaped quotes inside "src" are not supported.
img_re = re.compile(
    br'^\s*img [^>]*src\s*=\s*'
    br'(?P<srcwithquotes>(?P<quote>["\'])(?P<src>[^"\']+)(?P=quote))',
    re.IGNORECASE)
# Variant for an unquoted src value.  The scanner does not use it.
# NOTE(review): the group name "src" is assumed by analogy with img_re.
img_re2 = re.compile(br'^\s*img [^>]*src\s*=(?P<src>[^ "\']+)', re.IGNORECASE)

_KNOWN_MIMETYPES = {
    "gif": "image/gif",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
}
_FALLBACK_MIMETYPE = "application/octet-stream"


def _to_text(raw):
    """Return the bytes *raw* as a native string usable as a path or URL."""
    if isinstance(raw, str):
        # Python 2: byte strings already are the native string type.
        return raw
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this cannot fail.
        return raw.decode("latin-1")


def _guess_mimetype(src):
    """Return a MIME type for the image reference *src*; never fails."""
    path = urlparse(src).path or src  # ignore any "?query" or "#fragment"
    extension = path.rsplit(".", 1)[-1].lower()
    known = _KNOWN_MIMETYPES.get(extension)
    if known is not None:
        return known
    return mimetypes.guess_type(path)[0] or _FALLBACK_MIMETYPE


def _open_input(location):
    """Open the document at *location* (URL or local path) for binary reading."""
    # A one-letter "scheme" is a Windows drive letter, not a URL scheme.
    if len(urlparse(location).scheme) > 1:
        return urlopen(location)
    return open(location, "rb")


def _load_image(src, base_location, using_http):
    """Return the bytes of image *src*, or None when it cannot be embedded.

    With *using_http* true, *src* is resolved against *base_location* and
    fetched; otherwise ``"./" + src`` is opened.  A failure is reported on
    stderr rather than raised, so one broken reference does not abort the
    whole document.
    """
    if src.lower().startswith("data:"):
        return None  # already inlined
    try:
        if using_http:
            handle = urlopen(urljoin(base_location, src))
        else:
            # Binary mode: image bytes must not go through newline translation.
            handle = open("./" + src, "rb")
        with contextlib.closing(handle):
            return handle.read()
    except (IOError, OSError, ValueError) as exc:
        sys.stderr.write("warning: cannot embed %s: %s\n" % (src, exc))
        return None


def _embed_image(tag_buffer, outputfile, load_image):
    """Rewrite the ``src`` of the tag at the start of *tag_buffer*.

    *tag_buffer* begins immediately after a "<".  If it opens an ``<img>``
    tag with a quoted ``src`` and the image loads, everything up to the
    attribute value plus the replacement ``data:`` URI is written to
    *outputfile*, and the bytes after the original value are returned.
    Otherwise nothing is written and *tag_buffer* is returned unchanged.
    """
    match = img_re.search(tag_buffer)
    if match is None:
        return tag_buffer
    src = _to_text(match.group("src"))
    print(src)
    data = load_image(src)
    if data is None:
        return tag_buffer
    mimetype = _guess_mimetype(src).encode("ascii")
    outputfile.write(tag_buffer[:match.start("srcwithquotes")])
    # b64encode yields a single line, so no newlines end up in the attribute.
    outputfile.write(
        b'"data:' + mimetype + b";base64," + base64.b64encode(data) + b'"')
    return tag_buffer[match.end("srcwithquotes"):]


def archive(inputfile, outputfile, load_image):
    """Copy *inputfile* to *outputfile*, inlining images on the way.

    Both arguments are binary file-like objects.  *load_image* is called
    with each image reference (a native string) and returns the image
    bytes, or None to leave that tag untouched.
    """
    state = STATE_NOMATCH
    at_eof = False
    pending = inputfile.read(CHUNK_SIZE)
    while True:
        if state == STATE_NOMATCH:
            pos = pending.find(b"<")
            if pos != -1:
                outputfile.write(pending[:pos + 1])
                pending = pending[pos + 1:]
                state = STATE_SAWBRACKET
                continue
            outputfile.write(pending)
            pending = b""
        elif state == STATE_SAWBRACKET:
            # Only examine the tag once it is complete; a tag split across a
            # read boundary would otherwise never match.  Give up waiting at
            # end of input or when the buffer grows unreasonably large.
            if b">" in pending or at_eof or len(pending) >= MAX_TAG_SIZE:
                pending = _embed_image(pending, outputfile, load_image)
                state = STATE_INTAG
                continue
        else:  # STATE_INTAG
            pos = pending.find(b">")
            if pos != -1:
                outputfile.write(pending[:pos + 1])
                pending = pending[pos + 1:]
                state = STATE_NOMATCH
                continue
            outputfile.write(pending)
            pending = b""

        if at_eof:
            break
        chunk = inputfile.read(CHUNK_SIZE)
        if not chunk:
            at_eof = True
        pending += chunk


def main(argv=None):
    """Run the script; *argv* defaults to ``sys.argv``.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        program = argv[0] if argv else "archive"
        sys.stderr.write("usage: %s INPUT OUTPUT\n" % program)
        return 2

    inputfilename = argv[1]
    outputfilename = argv[2]
    using_http = inputfilename.lower().startswith(("http:", "https:"))

    def load_image(src):
        return _load_image(src, inputfilename, using_http)

    with contextlib.closing(_open_input(inputfilename)) as inputfile:
        with open(outputfilename, "wb") as outputfile:
            archive(inputfile, outputfile, load_image)
    return 0


if __name__ == "__main__":
    sys.exit(main())