##
##The first rule of VideoScriptClub is : You shouldn't talk about this script.
##I accept no liability if anyone uses this.
##
##Just never, ever use this as it
##certainly contravenes the terms and
##conditions of use of your video hosting site
##
##If you do try to use it you need
##python2.4 from www.python.org
##
##Greg MC 1/1/2007
##
##The code below sticks to syntax that python2.4 accepts and falls back to
##the renamed modules when it is imported under Python 3.

import re
import string
import sys
import os

try:  # Python 2 module names
    import urllib2
    from urllib import unquote as _unquote
except ImportError:  # Python 3 renamed them
    import urllib.request as urllib2
    from urllib.parse import unquote as _unquote

try:
    import Tkinter
except ImportError:
    try:
        import tkinter as Tkinter
    except ImportError:
        Tkinter = None  # no Tk installed: main() refuses to start


def getDefaultDir(default="help?"):
    '''Return the default download directory as a string ending in a separator.

    Uses $HOME/Desktop/ when HOME is set, otherwise %USERPROFILE%\\Bureau\\
    (French Windows desktop), otherwise the given default text.
    '''
    if 'HOME' in os.environ:  # nix
        return os.environ['HOME'] + '/Desktop/'
    elif 'USERPROFILE' in os.environ:  # win
        return os.environ['USERPROFILE'] + '\\Bureau\\'  # I live in France
    else:
        return default


def openCx(url, data=None):
    '''Open url (POSTing data when it is not None) and return the response.

    Raises whatever urlopen raises when the request fails.
    '''
    # Passing data to the constructor is equivalent to the old add_data()
    # call and still exists in Python 3.
    req = urllib2.Request(url, data)
    # Try to mimic Firefox, at least a little bit
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0')
    req.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
    req.add_header('Accept', 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5')
    req.add_header('Accept-Language', 'en-us,en;q=0.5')
    return urllib2.urlopen(req)


class Scraper(object):
    '''wrapper for the download

    Fetches the page at url and runs the scrape_<site>_page method that
    matches the host name.  Afterwards self.url holds the video address
    (None when nothing usable was found) and self.fn a file name stem.
    '''

    def __init__(self, url):
        self.baseUrl = url
        self.fn = None
        self.url = None
        self.site = None
        self.pageData = None
        # NOTE(review): the group name had been lost from the pasted source;
        # restored as "site" because that is the group read just below.
        mm = re.search(r'[\.|/](?P<site>\w+)\.com', url)
        if not mm:
            print("Scraper: not a valid url %s" % url)
        else:
            self.site = mm.group('site')
            print("%s\nSITE = %s" % (url, self.site))
            if self._fetchPage():
                self._runSiteScraper()
        print("scraper done")

    def _fetchPage(self):
        '''Download baseUrl into self.pageData; return True on success.'''
        try:
            cx = openCx(self.baseUrl)
            try:
                data = cx.read()
            finally:
                cx.close()
        except Exception:
            print("Scraper: failed to read page %s" % self.baseUrl)
            return False
        if not isinstance(data, str):
            # Python 3 hands back bytes; the patterns below are text.
            data = data.decode('utf-8', 'replace')
        self.pageData = data
        return True

    def _runSiteScraper(self):
        '''Call the scrape_<site>_page method, reporting why it failed.'''
        handler = getattr(self, "scrape_%s_page" % self.site, None)
        if handler is None:
            print("%s not implemented yet" % self.site)
            return
        try:
            handler()
        except Exception:
            # A page that no longer matches is a different problem from a
            # site that was never supported, so say so.
            self.url = None
            print("%s: could not parse page (%s)" % (self.site, sys.exc_info()[1]))

    def _find(self, pattern, text):
        '''Return the match of pattern in text; raise ValueError if absent.'''
        found = re.search(pattern, text)
        if found is None:
            raise ValueError("pattern not found: %s" % pattern)
        return found

    def getTitle(self, pageData):
        '''Return the page title with its first hyphen removed, or "".

        Used for Google video and DailyMotion.  Only the first two
        hyphen-separated pieces of the title are kept.
        '''
        # NOTE(review): the markup around the group was missing from the
        # pasted source; restored as the <title> element - confirm.
        m = re.search(r'<title>(?P<title>.+?)</title>', pageData)
        if m is None or not m.group('title'):
            return ""
        title = m.group('title').strip()
        return "".join(title.split('-')[:2])

    def stripBadChar(self, fn):
        '''Return fn with everything but word characters and whitespace removed.'''
        p = re.compile(r'''\w|\s''')
        return "".join(p.findall(fn))

    def scrape_ifilm_page(self):
        '''Set url/fn for an ifilm page; the clip id is the last URL segment.'''
        # NOTE(review): pattern restored (the pasted source had lost the
        # opening <h2 ...> tag) - confirm against a live page.
        m = self._find(r'<h2 id="itemtitle[_|\d]*">(?P<title>.*?)</h2>', self.pageData)
        self.url = "http://download.ifilm.com/flv/%s_300.flv" % self.baseUrl.split('/')[-1]
        self.fn = self.stripBadChar(m.group('title')) + "_IFILM"

    def scrape_myspace_page(self):
        '''Set url/fn for a myspace page from the first number in the URL.'''
        m = self._find(r'videoid=\d+">(?P<title>.+?)<', self.pageData)
        vid_id = self._find(r"\d+", self.baseUrl).group(0)  # take the first number in URL
        tail = vid_id[-4:][::-1]  # last four digits, reversed
        # Everything before the last five digits, padded to 7 digits.  The
        # old string.zfill(0, n) + head form added a stray "0" once the
        # head already had seven or more digits.
        head = vid_id[:-5].zfill(7)
        self.url = "http://content.movies.myspace.com/%s/%s/%s/%s.flv" % (head, tail[:2], tail[2:4], vid_id)
        self.fn = self.stripBadChar(m.group('title')) + "_myspace"

    def scrape_dailymotion_page(self):
        '''Set url/fn for a dailymotion page from its flashvars parameter.'''
        title = self.getTitle(self.pageData)
        # getTitle has already removed the hyphen, so look for the site
        # name itself; the former find('- Dailymotion') always returned -1
        # and chopped the last character instead of the suffix.
        cut = title.rfind('Dailymotion')
        if cut >= 0:
            title = title[:cut]
        m = self._find('<param name="flashvars" value="url=(?P<vid_url>[^&]+)', self.pageData)
        self.url = _unquote(m.group('vid_url'))
        self.fn = self.stripBadChar(title.strip())

    def scrape_youtube_page(self):
        '''Set url/fn for a youtube page from its watch_fullscreen link.'''
        m = self._find(r'watch_fullscreen\?(?P<vid_query>.*?)&fs=1&title=" \+ "(?P<title>[^"]*)"', self.pageData)
        self.fn = self.stripBadChar(m.group('title'))
        self.url = "http://youtube.com/get_video.php?%s" % m.group('vid_query')

    def scrape_google_page(self):
        '''Set url/fn for a google video page; url stays None without a link.'''
        m = re.search(r'%2Fvideodownload([^&]+)', self.pageData)
        if m:
            self.url = _unquote("http%3A%2F%2Fvp.video.google.com" + m.group(0))
            self.fn = self.stripBadChar(self.getTitle(self.pageData))
        else:
            self.url = None


class Downloader(object):
    '''Copies one video to disk a block at a time, driven by getChunk().

    Talks to the GUI through the module globals STDOUT, sourceURL and
    vidDir that main() creates.
    '''

    def __init__(self):
        self.fp = None
        self.cx = None
        self.reset()

    def _closeStreams(self):
        '''Close and forget the output file and the connection, if any.'''
        for stream in (self.fp, self.cx):
            if stream is None:
                continue
            try:
                stream.close()
            except (IOError, OSError):
                # nothing useful left to do with a stream being discarded
                pass
        self.fp = self.cx = None

    def reset(self):
        '''Return to the idle state and invite a new URL.'''
        # Close first: the old code cleared self.fp before testing it, so
        # the file was never closed.
        self._closeStreams()  # housekeeping
        self.block_size = 1024**2  # bytes requested per tick (one MiB)
        self.size = self.totalSize = 0  # download stats
        self.url = self.fn = None
        self.busy = self.ok = False  # boolean states
        freeMessage()

    def setData(self, vidData):
        '''Queue the video described by vidData (a Scraper) for download.'''
        if not self.busy and vidData.url:
            self._closeStreams()  # drop any transfer still in flight
            self.url = vidData.url
            self.fn = "%s%s.flv" % (vidDir.get(), vidData.fn)
            self.busy = False
            self.ok = True
            busyMessage()
            STDOUT.clear()
        else:
            self.ok = False

    def getChunk(self):
        '''Copy one block; return the fraction still to download.

        This is the callback for the progress bar.  Returns 1.0 whenever
        no transfer is active.
        '''
        if self.ok and not self.cx:
            self.openCx()
        if self.cx and not self.busy and self.size > 0:
            try:
                chunk = self.cx.read(self.block_size)
                if not chunk:
                    raise IOError("connection closed early")
                self.fp.write(chunk)
                self.size -= len(chunk)
                if self.size <= 0:
                    # Finished: flush the file to disk and stop polling.
                    self._closeStreams()
                    self.size = 0
                    self.ok = False
                    STDOUT.write("DONE")
                    freeMessage()
            except Exception:
                STDOUT.write('FAILED: download interrupted')
                self.reset()
                return 1.0
            return 1.0 * self.size / self.totalSize
        return 1.0

    def _fail(self, message):
        '''Report message and stop, so that getChunk does not retry forever.'''
        STDOUT.write(message)
        self._closeStreams()
        self.ok = False
        freeMessage()

    def openCx(self):
        '''Open the connection and the output file for the queued video.'''
        STDOUT.write("opening cx")
        try:
            self.cx = openCx(self.url)
            self.totalSize = self.size = int(self.cx.info()['Content-Length'])
        except Exception:
            self._fail("FAILED: can't open cx")
            return
        # self.fn already carries the .flv extension
        STDOUT.write("video : %s \nsaving to %s" % (self.url, self.fn))
        STDOUT.write("SIZE = %dK" % (self.totalSize // 1000))
        try:
            self.fp = open(self.fn, "wb")  # this shouldn't be here
        except (IOError, OSError):
            self._fail("FAILED: can't write %s" % self.fn)


##gui stuff
##Tkinter sucks should use swt

class textPad(object):
    '''A white canvas used as a write-only message window.'''
    width = 400
    height = 100
    curLine = 0  # line count

    def __init__(self, parent):
        self.canvas = Tkinter.Canvas(parent, width=self.width, height=self.height, bg="white")
        self.canvas.pack()
        #self.canvas.pack(side=Tkinter.BOTTOM, fill=Tkinter.BOTH,expand=1)

    def write(self, txt):
        '''Draw txt below the text already shown.'''
        self.canvas.create_text(10, 10 + 20 * self.curLine, text=txt, fill='red',
                                anchor=Tkinter.NW, tags="TXT")
        self.curLine += 1 + txt.count("\n")  # take a new line

    def clear(self):
        '''Remove all text and start again from the top line.'''
        self.canvas.delete("TXT")
        self.curLine = 0  # otherwise new text lands below the visible area


class progressBar(object):
    '''A blue bar overlaid with a red one showing the completed fraction.

    callback is polled every 500 ms and must return the fraction left.
    '''
    width = 300
    height = 40
    callback = None

    def __init__(self, parent, callback=None):
        box = Tkinter.Frame(parent, height=self.height, width=90, bd=1, relief=Tkinter.SUNKEN)
        box.pack(fill=Tkinter.X, padx=1, pady=1)
        #populate frame
        downLoad2 = Tkinter.Label(box, text="DL progress")
        downLoad2.pack(side=Tkinter.LEFT)
        self.bar = Tkinter.Canvas(box, width=self.width, height=self.height)
        self.bar.pack(side=Tkinter.RIGHT)
        self.bar.create_rectangle(0, self.height, self.width, 0, fill="blue")
        # One red rectangle, resized on each tick; drawing fresh rectangles
        # every tick made the canvas grow without bound.
        self._done = self.bar.create_rectangle(0, self.height, 0, 0, fill="red")
        self.callback = callback
        #behavior
        self.bar.after(500, self.update)

    def update(self):
        '''Poll the callback, redraw, and schedule the next poll.'''
        try:
            if self.callback is not None:
                fraction = 1.0 - self.callback()
                fraction = min(max(fraction, 0.0), 1.0)
                self.bar.coords(self._done, 0, self.height, int(self.width * fraction), 0)
        finally:
            # keep polling even when one tick raised
            self.bar.after(500, self.update)


#this is the right order
def doVid():
    '''callback talks to Scraper and Download'''
    url = sourceURL.get()  # take copy
    vidData = Scraper(url)
    download.setData(vidData)


def busyMessage():
    '''Show the busy text in the URL entry.'''
    sourceURL.set("I'm busy.....")  # overwrite


def freeMessage():
    '''Show the ready text in the URL entry.'''
    sourceURL.set('done, your url?')  # overwrite


def main():
    '''Build the window and run the Tk main loop.'''
    global root, sourceURL, vidDir, download, STDOUT
    if Tkinter is None:
        raise RuntimeError("Tkinter is not available")
    #create root first else ERROR
    root = Tkinter.Tk()
    root.title('Greg\'s GenericVideoDownloader: paste URL below')
    sourceURL = Tkinter.StringVar()  # this should belong to download
    download = Downloader()
    vidDir = Tkinter.StringVar()
    vidDir.set(getDefaultDir("paste download path here?"))  # default download

    box1 = Tkinter.Frame(root, height=18, width=90, bd=1, relief=Tkinter.SUNKEN)
    box1.pack(fill=Tkinter.X, padx=1, pady=1)
    urlFrom = Tkinter.Label(box1, text="Source URL")
    urlFrom.pack(side=Tkinter.LEFT)
    Tkinter.Button(root, text="OK", command=doVid).pack()
    progBar = progressBar(root, callback=download.getChunk)
    Tkinter.Entry(box1, width=60, textvariable=sourceURL).pack(side=Tkinter.RIGHT)

    box2 = Tkinter.Frame(root, height=18, width=90, bd=1, relief=Tkinter.SUNKEN)
    box2.pack(fill=Tkinter.X, padx=1, pady=1)
    downLoad2 = Tkinter.Label(box2, text="Download directory")
    downLoad2.pack(side=Tkinter.LEFT)
    Tkinter.Entry(box2, width=50, textvariable=vidDir).pack(side=Tkinter.RIGHT)

    STDOUT = textPad(root)  # text window
    root.mainloop()


if __name__ == "__main__":
    main()

##
##You shouldn't use this.
##I accept no liability if anyone uses this.
##
##Just never, ever use this as it
##probably contravenes the terms and
##conditions of use of video hosting sites
##
##If you do try to use it you need
##python2.4 from www.python.org
##
##Greg MC 1/1/2007
##
##The code below sticks to syntax that python2.4 accepts and falls back to
##the renamed modules when it is imported under Python 3.

import re
import sys
import os

try:  # Python 2 module names
    import urllib2
    from urllib import unquote as _unquote
except ImportError:  # Python 3 renamed them
    import urllib.request as urllib2
    from urllib.parse import unquote as _unquote

try:
    import Tkinter
except ImportError:
    try:
        import tkinter as Tkinter
    except ImportError:
        Tkinter = None  # no Tk installed: main() refuses to start

_prompt1 = 'your url?'


def stripBadChar(fn):
    '''Return fn with everything but word characters and whitespace removed.'''
    p = re.compile(r'''\w|\s''')
    return "".join(p.findall(fn))


def url2file(cx, fp, block_size=1024**2):
    '''downloads to a file
    this has a text progress bar

    Copies cx to fp in blocks of block_size bytes (one MiB by default)
    until cx is exhausted, printing one star per block.
    '''
    try:
        size = int(cx.info()['Content-Length'])
    except (KeyError, TypeError, ValueError):
        size = 0  # length unknown: no ruler, but still copy everything
    sys.stdout.write(".-" * (size // block_size) + "\n")
    # Read to end of stream instead of trusting the advertised length.
    while True:
        chunk = cx.read(block_size)
        if not chunk:
            break
        fp.write(chunk)
        sys.stdout.write('* ')
        sys.stdout.flush()
    sys.stdout.write("\n done\n")


def openCx(url, data=None):
    '''Open url (POSTing data when it is not None) and return the response.

    Raises whatever urlopen raises when the request fails.
    '''
    # Passing data to the constructor is equivalent to the old add_data()
    # call and still exists in Python 3.
    req = urllib2.Request(url, data)
    # Try to mimic Firefox, at least a little bit
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0')
    req.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
    req.add_header('Accept', 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5')
    req.add_header('Accept-Language', 'en-us,en;q=0.5')
    return urllib2.urlopen(req)


def _toText(data):
    '''Return page data as text (Python 3 responses arrive as bytes).'''
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _find(pattern, text):
    '''Return the match of pattern in text; raise ValueError if absent.'''
    found = re.search(pattern, text)
    if found is None:
        raise ValueError("pattern not found: %s" % pattern)
    return found


def parseIFILMPage(url, pageData):
    '''Return (video url, file name stem) for an ifilm page.

    The clip id is the last segment of url.  Raises ValueError when the
    title heading is missing.
    '''
    m = _find(r'<h2 id="itemtitle[_|\d]*">(?P<title>.*?)</h2>', pageData)
    url = "http://download.ifilm.com/flv/%s_300.flv" % url.split('/')[-1]
    # the old code passed an undefined name here and raised NameError
    return url, stripBadChar(m.group('title')) + "_IFILM"


def parseDailyMotionPage(url, pageData):
    '''Return (video url, file name stem) for a dailymotion page.

    Raises ValueError when the title or the flashvars parameter is missing.
    '''
    # NOTE(review): the pasted source had lost everything from the closing
    # </title> tag to the middle of the youtube pattern; both patterns in
    # this function are restored - confirm against a live page.
    title = _find(r'<title>(?P<title>.*?)</title>', pageData).group('title')
    cut = title.find('- Dailymotion')
    if cut >= 0:  # without the suffix keep the whole title
        title = title[:cut]
    # (an earlier attempt walked the meta/param tags with a soup parser)
    m = _find('<param name="flashvars" value="url=(?P<vid_url>[^&]+)', pageData)
    return _unquote(m.group('vid_url')), stripBadChar(title)


def parseYoutubePage(url, data):
    '''Return (video url, file name stem) for a youtube page.

    Raises ValueError when the watch_fullscreen link is missing.
    '''
    # NOTE(review): the def line and the start of this pattern were missing
    # from the pasted source and are restored - confirm.
    m = _find(r'watch_fullscreen\?(?P<vid_query>.*?)&fs=1&title=" \+ "(?P<title>[^"]*)"', data)
    fn = m.group('title')
    url = "http://youtube.com/get_video.php?%s" % m.group('vid_query')
    return url, stripBadChar(fn)


def getVid(url, fn, vidDir):
    '''Download url to <vidDir><fn>.FLV; return 1 on success, None on failure.'''
    print("video : %s" % fn)
    print("saving to %s.FLV" % fn)
    try:
        cx = openCx(url)
    except Exception:
        print('FAILED')
        return None
    try:
        try:
            fp = open("%s%s.FLV" % (vidDir, fn), "wb")
            try:
                url2file(cx, fp)
            finally:
                fp.close()
        finally:
            cx.close()
    except Exception:
        print('FAILED')
        return None
    return 1


def _scrape(url):
    '''Fetch the page at url and return (video url, file name stem).

    Raises ValueError for sites without a parser or pages that no longer
    match; network errors pass through.
    '''
    parts = url.split('.')
    site = ''
    if len(parts) > 1:
        site = parts[1].lower()
    print(site)
    if site == "ifilm":
        parser = parseIFILMPage
    elif site == "dailymotion":
        parser = parseDailyMotionPage
    elif url.lower().find("youtube") > -1:
        parser = parseYoutubePage
    else:
        raise ValueError("unsupported site: %s" % url)
    cx = openCx(url)
    try:
        pageData = _toText(cx.read())
    finally:
        cx.close()
    return parser(url, pageData)


def doVid():
    '''OK-button callback: scrape the pasted URL and download the video.'''
    url = sourceURL.get()  # take copy
    if url == _prompt1:
        return
    sourceURL.set("I'm busy.....")  # overwrite
    try:
        vidUrl, fn = _scrape(url)
    except Exception:
        # never leave the busy text up after an error
        print("FAILED: %s" % sys.exc_info()[1])
        sourceURL.set('...FAILED another vid?')
        return
    print(vidUrl)
    if getVid(vidUrl, fn, vidDir.get()):
        sourceURL.set('...OK next vid')
    else:
        sourceURL.set('...FAILED another vid?')


def main():
    '''Build the window and run the Tk main loop.'''
    global root, sourceURL, vidDir
    if Tkinter is None:
        raise RuntimeError("Tkinter is not available")
    #create root first else ERROR
    root = Tkinter.Tk()
    root.title('Greg\'s GenericVideoDownloader: paste URL below')
    sourceURL = Tkinter.StringVar()
    sourceURL.set(_prompt1)
    vidDir = Tkinter.StringVar()
    # expanduser also works where HOME is not set; the trailing '' keeps
    # the separator that getVid relies on.
    vidDir.set(os.path.join(os.path.expanduser('~'), 'Desktop', ''))  # default download
    Tkinter.Entry(root, width=70, textvariable=sourceURL).pack()
    box = Tkinter.Frame(root, height=25, width=90)
    box.pack(fill=Tkinter.X, padx=5, pady=5)
    downLoad2 = Tkinter.Label(box, text="Dowload directory")
    downLoad2.pack(side=Tkinter.LEFT)
    Tkinter.Entry(root, width=70, textvariable=vidDir).pack()
    Tkinter.Button(root, text="OK", command=doVid).pack()
    root.mainloop()


if __name__ == "__main__":
    main()