Mirror of metasploit

crawler.rb 9.0KB

# -*- coding: binary -*-
module Msf

###
#
# This module provides methods for implementing a web crawler
#
###
module Auxiliary::HttpCrawler

  include ::Msf::Auxiliary::Report

  def initialize(info = {})
    super

    register_options(
      [
        Opt::RHOST,
        Opt::RPORT(80),
        OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
        OptString.new('URI', [ true, "The starting page to crawl", "/"]),
        Opt::Proxies,
        OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
        OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
        OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4]),
        OptString.new('USERNAME', [false, 'The HTTP username to specify for authentication']),
        OptString.new('PASSWORD', [false, 'The HTTP password to specify for authentication']),
        OptString.new('DOMAIN', [ true, 'The domain to use for windows authentication', 'WORKSTATION']),
        OptBool.new('SSL', [ false, 'Negotiate SSL/TLS for outgoing connections', false])
      ], self.class
    )

    register_advanced_options(
      [
        OptBool.new('DirBust', [ false, 'Bruteforce common URL paths', true]),
        OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
        OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
        OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
        OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
          "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        ]),
        OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
        OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
        OptString.new('HTTPAdditionalHeaders', [false, "A list of additional headers to send (separated by \\x01)"]),
        OptString.new('HTTPCookie', [false, "A HTTP cookie header to send with each request"]),
        Opt::SSLVersion
      ], self.class
    )

    register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
    register_autofilter_services(%W{ http https })

    begin
      require 'anemone'
      @anemone_loaded = true
    rescue ::Exception => e
      @anemone_loaded = false
      @anemone_error = e
    end
  end

  def setup
    raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}" if not @anemone_loaded
    super
  end

  def cleanup
    if @crawler
      @crawler.shutdown rescue nil
      @crawler = nil
    end
    super
  end

  ##
  #
  # Crawler methods and accessors
  #
  ##

  # A target object for tracking URLs
  class WebTarget < ::Hash
    def to_url
      proto = self[:ssl] ? "https" : "http"
      host = self[:vhost] ? self[:vhost] : self[:host]
      if Rex::Socket.is_ipv6?(host)
        host = "[#{host}]"
      end
      "#{proto}://#{host}:#{self[:port]}#{self[:path]}"
    end
  end
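
  # Illustrative only (not part of the original source): given the fields
  # populated by #run below, #to_url renders the starting URL of the crawl:
  #
  #   t = WebTarget.new
  #   t.merge!(:ssl => true, :vhost => "www.example.com", :host => "192.0.2.1",
  #            :port => 443, :path => "/app")
  #   t.to_url   # => "https://www.example.com:443/app"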

  # A custom error to signify we hit the page request cap
  class MaximumPageCount < ::RuntimeError
  end

  # Some accessors for stat tracking
  attr_accessor :targets
  attr_accessor :url_count, :url_total, :form_count, :request_count

  # Entry point for the crawler code
  def run
    self.request_count = 0
    self.form_count = 0
    self.url_count = 0
    self.url_total = 1

    path,query = datastore['URI'].split('?', 2)
    query ||= ""

    t = WebTarget.new
    t.merge!({
      :vhost => vhost,
      :host  => rhost,
      :port  => rport,
      :ssl   => ssl,
      :path  => path,
      :query => query,
      :info  => ""
    })

    if datastore['USERNAME'] and datastore['USERNAME'] != ''
      t[:username] = datastore['USERNAME'].to_s
      t[:password] = datastore['PASSWORD'].to_s
      t[:domain]   = datastore['DOMAIN'].to_s
    end

    if datastore['HTTPCookie']
      t[:cookies] = {}
      datastore['HTTPCookie'].to_s.split(';').each do |pair|
        k,v = pair.strip.split('=', 2)
        next if not v
        t[:cookies][k] = v
      end
    end
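    # Illustrative only: a cookie string such as "PHPSESSID=abc123; lang=en"
    # is split on ';' and '=' above, so t[:cookies] would become
    # { "PHPSESSID" => "abc123", "lang" => "en" }.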
    if datastore['HTTPAdditionalHeaders']
      t[:headers] = datastore['HTTPAdditionalHeaders'].to_s.split("\x01").select{|x| x.to_s.length > 0}
    end
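    # Illustrative only: with the \x01 separator described by the option above,
    # a value like "X-Forwarded-For: 127.0.0.1\x01X-Scanner: msf" yields
    # t[:headers] == ["X-Forwarded-For: 127.0.0.1", "X-Scanner: msf"].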
    t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

    print_status("Crawling #{t.to_url}...")
    begin
      @current_vhost = t[:vhost]
      @current_site = t[:site]
      ::Timeout.timeout(max_crawl_time) { crawl_target(t) }
    rescue ::Timeout::Error
      print_error("Crawl of #{t.to_url} has reached the configured timeout")
    ensure
      @current_vhost = nil
    end
    print_status("Crawl of #{t.to_url} complete")
  end

  def get_connection_timeout
    datastore['RequestTimeout']
  end

  def max_page_count
    datastore['MAX_PAGES']
  end

  def max_crawl_time
    datastore['MAX_MINUTES'] * 60.0
  end

  def max_crawl_threads
    datastore['MAX_THREADS']
  end

  def dirbust?
    datastore['DirBust']
  end

  # Scrub links that end in these extensions. If more or less is
  # desired by a particular module, this should get redefined.
  def get_link_filter
    /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  end
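
  # Illustrative only: a module mixing in HttpCrawler can widen or narrow the
  # filter by redefining this method, e.g. to also skip stylesheets and fonts:
  #
  #   def get_link_filter
  #     /\.(css|woff2?|ttf|js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  #   end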

  def focus_crawl(page)
    page.links
  end
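
  # Illustrative only: focus_crawl can be redefined to limit which of a page's
  # links Anemone follows (page.links is an array of URI objects), e.g. to stay
  # beneath the configured starting path:
  #
  #   def focus_crawl(page)
  #     page.links.select { |link| link.path.to_s.start_with?(datastore['URI']) }
  #   end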

  def crawl_target(t)
    cnt  = 0
    opts = crawler_options(t)
    url  = t.to_url

    @crawler = ::Anemone::Core.new([url], opts)
    @crawler.on_every_page do |page|
      cnt += 1
      self.request_count += 1

      # Extract any interesting data from the page
      crawler_process_page(t, page, cnt)

      # Blow up if we hit our maximum page count
      if cnt >= max_page_count
        print_error("Maximum page count reached for #{url}")
        raise MaximumPageCount, "Maximum page count reached"
      end
    end

    # Skip link processing based on a regular expression
    @crawler.skip_links_like(
      get_link_filter
    )

    # Focus our crawling on interesting, but not over-crawled links
    @crawler.focus_crawl do |page|
      focus_crawl(page)
    end

    begin
      @crawler.run
    rescue MaximumPageCount
      # No need to print anything else
    rescue ::Timeout::Error
      # Bubble this up to the top-level handler
      raise $!
    rescue ::Exception => e
      # Anemone sometimes raises an anonymous timeout exception of unclear
      # origin; detect it by message and re-raise it as a proper Timeout::Error.
      if e.to_s =~ /execution expired/
        raise ::Timeout::Error
      else
        print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
      end
    ensure
      @crawler.shutdown rescue nil
      @crawler = nil
    end
  end

  # Specific module implementations should redefine this method
  # with whatever is meaningful to them.
  def crawler_process_page(t, page, cnt)
    msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
    case page.code
    when 301,302
      if page.headers and page.headers["location"]
        print_status(msg + " -> " + page.headers["location"].to_s)
      else
        print_status(msg)
      end
    when 500...599
      # XXX: Log the fact that we hit an error page
      print_good(msg)
    when 401,403
      print_good(msg)
    when 200
      print_status(msg)
    when 404
      print_error(msg)
    else
      print_error(msg)
    end
  end
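
  # Illustrative only: a scanner module would typically redefine
  # crawler_process_page to record what it finds rather than just print a
  # status line, e.g. storing each successfully fetched page through the
  # Report mixin included above (a sketch, not part of this file):
  #
  #   def crawler_process_page(t, page, cnt)
  #     return unless page.code == 200
  #     report_web_page(
  #       :web_site => t[:site],
  #       :path     => page.url.path,
  #       :query    => page.url.query.to_s,
  #       :code     => page.code,
  #       :body     => page.body.to_s,
  #       :headers  => page.headers
  #     )
  #   end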

  def crawler_options(t)
    opts = {}
    opts[:user_agent] = datastore['UserAgent']
    opts[:verbose] = false
    opts[:threads] = max_crawl_threads
    opts[:obey_robots_txt] = false
    opts[:redirect_limit] = datastore['RedirectLimit']
    opts[:retry_limit] = datastore['RetryLimit']
    opts[:accept_cookies] = true
    opts[:depth_limit] = false
    opts[:skip_query_strings] = false
    opts[:discard_page_bodies] = true
    opts[:framework] = framework
    opts[:module] = self
    opts[:timeout] = get_connection_timeout
    opts[:dirbust] = dirbust?

    if (t[:headers] and t[:headers].length > 0)
      opts[:inject_headers] = t[:headers]
    end

    if t[:cookies]
      opts[:cookies] = t[:cookies]
    end

    opts[:username] = t[:username] || ''
    opts[:password] = t[:password] || ''
    opts[:domain]   = t[:domain] || 'WORKSTATION'

    opts
  end

  ##
  #
  # Wrappers for getters
  #
  ##

  #
  # Returns the target host
  #
  def rhost
    datastore['RHOST']
  end

  #
  # Returns the remote port
  #
  def rport
    datastore['RPORT']
  end

  #
  # Returns the VHOST of the HTTP server.
  #
  def vhost
    datastore['VHOST'] || datastore['RHOST']
  end

  #
  # Returns the boolean indicating SSL
  #
  def ssl
    ((datastore.default?('SSL') and rport.to_i == 443) or datastore['SSL'])
  end

  #
  # Returns the string indicating SSL version
  #
  def ssl_version
    datastore['SSLVersion']
  end

  #
  # Returns the configured proxy list
  #
  def proxies
    datastore['Proxies']
  end

end
end
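
For reference, below is a minimal sketch (not part of crawler.rb or metasploit-framework) of how an auxiliary module consumes this mixin: it includes Msf::Auxiliary::HttpCrawler, the mixin's run method drives the crawl using the options registered above (URI, MAX_PAGES, MAX_MINUTES, etc.), and the module's crawler_process_page override handles each fetched page. The class name, module metadata, and output format here are illustrative.

  class MetasploitModule < Msf::Auxiliary
    include Msf::Auxiliary::HttpCrawler

    def initialize(info = {})
      super(update_info(info,
        'Name'        => 'Example HTTP Crawler',
        'Description' => 'Crawl a web site and print every URL visited',
        'Author'      => [ 'example' ],
        'License'     => MSF_LICENSE
      ))
    end

    # Invoked by the mixin for every page fetched during the crawl
    def crawler_process_page(t, page, cnt)
      print_status("[#{cnt}] #{page.code || 'ERR'} #{page.url}")
    end
  end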