<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>dev.enekoalonso.com &#187; crawl</title>
	<atom:link href="http://dev.enekoalonso.com/tag/crawl/feed/" rel="self" type="application/rss+xml" />
	<link>http://dev.enekoalonso.com</link>
	<description>having fun with code</description>
	<lastBuildDate>Wed, 12 Oct 2011 21:40:17 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	
		<item>
		<title>Crawling sitemaps with Python</title>
		<link>http://dev.enekoalonso.com/2009/09/09/crawling-sitemaps-with-python/</link>
		<comments>http://dev.enekoalonso.com/2009/09/09/crawling-sitemaps-with-python/#comments</comments>
		<pubDate>Wed, 09 Sep 2009 21:59:08 +0000</pubDate>
		<dc:creator>Eneko Alonso</dc:creator>
				<category><![CDATA[uncategorized]]></category>
		<category><![CDATA[crawl]]></category>
		<category><![CDATA[http]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[request]]></category>
		<category><![CDATA[script]]></category>
		<category><![CDATA[sitemap]]></category>
		<category><![CDATA[url]]></category>

		<guid isPermaLink="false">http://dev.enekoalonso.com/?p=379</guid>
		<description><![CDATA[This is a basic script I have created to crawl an xml sitemap file (does not support nested sitemaps). It will report if the request was successfully processed by the server or if, instead, it returned some kind of error. #!/usr/bin/env python from sys import argv from re import findall from socket import setdefaulttimeout from urllib2 [...]]]></description>
			<content:encoded><![CDATA[<p>This is a basic script I have created to crawl an xml sitemap file (does not support nested sitemaps). It will report if the request was successfully processed by the server or if, instead, it returned some kind of error.</p>
<div class="geshi no python">
<ol>
<li class="li1">
<div class="de1"><span class="co1">#!/usr/bin/env python</span></div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">from</span> <span class="kw3">sys</span> <span class="kw1">import</span> argv</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">from</span> <span class="kw3">re</span> <span class="kw1">import</span> findall</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">from</span> <span class="kw3">socket</span> <span class="kw1">import</span> setdefaulttimeout</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">from</span> <span class="kw3">urllib2</span> <span class="kw1">import</span> Request, urlopen</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">from</span> <span class="kw3">datetime</span> <span class="kw1">import</span> <span class="kw3">datetime</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="co1"># Initialization</span></div>
</li>
<li class="li1">
<div class="de1">procId = argv<span class="br0">&#91;</span><span class="nu0">2</span><span class="br0">&#93;</span></div>
</li>
<li class="li1">
<div class="de1">sitemapUrl = argv<span class="br0">&#91;</span><span class="nu0">1</span><span class="br0">&#93;</span></div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">print</span> <span class="st0">&#39;[%s]&#39;</span><span class="sy0">%</span>procId, <span class="st0">&quot;Crawling sitemap:&quot;</span>, sitemapUrl</div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="co1"># Test url</span></div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">def</span> testURL<span class="br0">&#40;</span>url<span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; start = <span class="kw3">datetime</span>.<span class="me1">now</span><span class="br0">&#40;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; msg = <span class="st0">&#39;&#39;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; <span class="kw3">code</span> = <span class="nu0">-1</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; req &nbsp;= Request<span class="br0">&#40;</span>url<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1">&nbsp; <span class="kw1">try</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; response = urlopen<span class="br0">&#40;</span>req<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw3">code</span> = response.<span class="kw3">code</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; <span class="kw1">except</span> <span class="kw2">IOError</span>, e:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">if</span> <span class="kw2">hasattr</span><span class="br0">&#40;</span>e, <span class="st0">&#39;reason&#39;</span><span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; msg = <span class="st0">&#39;[Error: %s]&#39;</span> <span class="sy0">%</span> e.<span class="me1">reason</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">elif</span> <span class="kw2">hasattr</span><span class="br0">&#40;</span>e, <span class="st0">&#39;code&#39;</span><span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; msg = <span class="st0">&#39;[Error: %s]&#39;</span> <span class="sy0">%</span> e.<span class="kw3">code</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1">&nbsp; delta = <span class="kw3">datetime</span>.<span class="me1">now</span><span class="br0">&#40;</span><span class="br0">&#41;</span> &#8211; start</div>
</li>
<li class="li1">
<div class="de1">&nbsp; <span class="kw1">print</span> <span class="st0">&#39;[%02s]&#39;</span><span class="sy0">%</span>procId, <span class="st0">&#39;[%d]&#39;</span><span class="sy0">%</span><span class="kw3">code</span>, <span class="st0">&#39;[%03dms]&#39;</span><span class="sy0">%</span><span class="br0">&#40;</span>delta.<span class="me1">microseconds</span>/<span class="nu0">1000</span><span class="br0">&#41;</span>, msg, <span class="st0">&#39;&gt;&gt;&#39;</span>, url</div>
</li>
<li class="li1">
<div class="de1">&nbsp; <span class="kw1">return</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="co1"># Load sitemap and process</span></div>
</li>
<li class="li1">
<div class="de1">req = Request<span class="br0">&#40;</span>sitemapUrl<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">htmlSource = urlopen<span class="br0">&#40;</span>req<span class="br0">&#41;</span>.<span class="me1">read</span><span class="br0">&#40;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">linksList = findall<span class="br0">&#40;</span><span class="st0">&#39;&lt;loc&gt;(.*?)&lt;/loc&gt;&#39;</span>, htmlSource<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">print</span> <span class="kw2">len</span><span class="br0">&#40;</span>linksList<span class="br0">&#41;</span>, <span class="st0">&quot;links found.&quot;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">for</span> link <span class="kw1">in</span> linksList:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; testURL<span class="br0">&#40;</span>link<span class="br0">&#41;</span></div>
</li>
</ol>
</div>
<p>The script expects 2 parameters: the URL of the XML sitemap and an identifier that will be printed to the log.</p>
<p> It is not very fast, but you can easily run multiple instances from the command line:</p>
<div class="geshi no bash">
<ol>
<li class="li1">
<div class="de1">.<span class="sy0">/</span>sitemap_crawler.py http:<span class="sy0">//</span>example.com<span class="sy0">/</span>sitemap.xml <span class="nu0">1</span> <span class="sy0">&amp;</span></div>
</li>
<li class="li1">
<div class="de1">.<span class="sy0">/</span>sitemap_crawler.py http:<span class="sy0">//</span>example.com<span class="sy0">/</span>sitemap.xml <span class="nu0">2</span> <span class="sy0">&amp;</span></div>
</li>
<li class="li1">
<div class="de1">.<span class="sy0">/</span>sitemap_crawler.py http:<span class="sy0">//</span>example.com<span class="sy0">/</span>sitemap.xml <span class="nu0">3</span> <span class="sy0">&amp;</span></div>
</li>
<li class="li1">
<div class="de1">.<span class="sy0">/</span>sitemap_crawler.py http:<span class="sy0">//</span>example.com<span class="sy0">/</span>sitemap.xml <span class="nu0">4</span> <span class="sy0">&amp;</span></div>
</li>
<li class="li1">
<div class="de1">.<span class="sy0">/</span>sitemap_crawler.py http:<span class="sy0">//</span>example.com<span class="sy0">/</span>sitemap.xml <span class="nu0">5</span> <span class="sy0">&amp;</span></div>
</li>
</ol>
</div>
<p>Enjoy!</p>
<h3  class="related_post_title">Related Posts:</h3><ul class="related_post"><li>August 6, 2011 -- <a href="http://dev.enekoalonso.com/2011/08/06/python-script-remove-empty-folders/" title="Python script: remove empty folders">Python script: remove empty folders</a> (1)</li><li>October 12, 2011 -- <a href="http://dev.enekoalonso.com/2011/10/12/command-line-scripting-with-node-js/" title="Command line scripting with Node.js">Command line scripting with Node.js</a> (0)</li><li>May 18, 2011 -- <a href="http://dev.enekoalonso.com/2011/05/18/location-hash-is-dead-on-html5-browsers/" title="location.hash is dead (on html5 browsers)">location.hash is dead (on html5 browsers)</a> (0)</li><li>August 18, 2010 -- <a href="http://dev.enekoalonso.com/2010/08/18/little-tricks-repeating-strings-in-javascript-python/" title="Little tricks: repeating strings in Javascript &#038; Python">Little tricks: repeating strings in Javascript &#038; Python</a> (1)</li></ul>]]></content:encoded>
			<wfw:commentRss>http://dev.enekoalonso.com/2009/09/09/crawling-sitemaps-with-python/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>

