Here is the problem:
I have a wget/httrack mirror of a web site and I want to get a list of all the “http(s)” URLs the site content points at, even those inside JavaScript, so HTML parsing alone wouldn’t work.
First: grep and extract every http(s) URL.
Second: clean the trailing ../whatever>”garbage here>?#blah… while keeping the “whatever”, i.e. the last part of the URL location.
#! /usr/bin/env python
"""List every http(s) URL referenced anywhere in a mirrored web site.

The mirror directory named by ``site`` is searched recursively with grep, so
URLs that appear in JavaScript or other non-HTML files are found as well.
Each hit is trimmed after its last '/' down to the first run of letters and
the result is written to ``urls.txt``, one URL per line.
"""
import os
import re

# Directory of the wget/httrack mirror to scan (passed to grep as the path).
site = 'www.example.es'

# First run of letters/underscores in the last URL segment.  The range is
# spelled out on purpose: "[A-z]" would also match the characters
# [ \ ] ^ and ` which sit between 'Z' and 'a' in ASCII.
_TRAIL_RE = re.compile(r"[A-Za-z_]+")


def clean_url(url):
    """Return *url* with the garbage after its last '/' removed.

    Everything up to and including the last '/' is kept (with any double
    quotes dropped).  Of the text after that slash only the first run of
    letters/underscores is kept, so ``page.html">blah`` becomes ``page``.
    When there is no such run the URL simply ends at the slash.

    NOTE: a URL without a path (``http://example.com``) has its host name
    treated as the last segment and therefore comes out as
    ``http://example``.
    """
    head, sep, dirt = url.rpartition('/')
    match = _TRAIL_RE.search(dirt)
    trail = match.group(0) if match else ""
    return (head + sep).replace('"', '') + trail


def main():
    """Grep the mirror for http(s) URLs and write the cleaned list."""
    # -i ignore case, -r recurse, -o print only the match, -h no file names,
    # -E extended regex.  The second grep drops lines containing "Binary",
    # i.e. grep's "Binary file ... matches" notices.
    command = 'grep -irohE "https?://(.*)" ' + site + '| grep -v Binary'
    with os.popen(command) as pipe:
        urls_dirt = pipe.read()

    # Each grep hit runs to the end of its line, so split on whitespace and
    # keep only the tokens that actually contain a URL.
    lines = [clean_url(url) + '\n' for url in urls_dirt.split() if 'http' in url]

    with open('urls.txt', 'w') as outfile:
        outfile.write("".join(lines))


if __name__ == '__main__':
    main()
Similar thing for images:
#! /usr/bin/env python
"""List the image URLs referenced by ``<img src=...>`` in a mirrored site.

The mirror directory named by ``site`` is searched recursively with grep and
every ``src=`` reference found on a matching line is written to
``images.txt`` as a full URL, one per line.
"""
import os
import re

# Directory of the wget/httrack mirror to scan; also used as the host name
# when a reference has to be turned into a full URL.
site = 'www.example.es'
# Not used by this script.
dest = 'mirror'


def image_url(token, site):
    """Turn one ``src=...`` token into a full URL.

    The ``src=`` marker and any single or double quotes are removed.
    References that are already absolute (``http://`` or ``https://``) are
    returned unchanged, protocol-relative ones (``//host/...``) get
    ``http:`` prepended, and everything else is prefixed with
    ``http://<site>``.

    Relative paths without a leading '/' are joined to the site root with a
    '/'.  NOTE(review): grep runs with -h, so the page a reference came from
    is unknown here; resolving against the site root is an assumption.
    """
    ref = token.replace('src=', '').replace('"', '').replace("'", '')
    if ref.startswith(('http://', 'https://')):
        return ref
    if ref.startswith('//'):
        return 'http:' + ref
    if ref.startswith('/'):
        return 'http://' + site + ref
    return 'http://' + site + '/' + ref


def main():
    """Grep the mirror for image references and write the URL list."""
    # -i ignore case, -r recurse, -o print only the match, -h no file names,
    # -E extended regex.  The second grep drops lines containing "Binary",
    # i.e. grep's "Binary file ... matches" notices.
    command = 'grep -irohE "img src=(.*)" ' + site + '| grep -v Binary'
    with os.popen(command) as pipe:
        urls_dirt = pipe.read()

    # Each grep hit runs to the end of its line, so split on whitespace and
    # keep only the tokens that carry a src= attribute.
    lines = [image_url(url, site) + '\n'
             for url in urls_dirt.split() if 'src=' in url]

    with open('images.txt', 'w') as outfile:
        outfile.write("".join(lines))


if __name__ == '__main__':
    main()
Advertisement

