webcrawler actually working

# webcrawler 1.1  created 2021.05.04 last updated 2021.05.05 j.thomas
# starts with a base page and grabs links as it finds them in each page, while noting instances of search term appearances in pages and links
# min ruby: 0.0.0
require 'uri';require 'open-uri'
class Web_Crawler
  def initialize
    @case_sensitivity=false ## whether the search is case sensitive (when false, capitalization is ignored)
    @counter=0 ## number of loops executed / pages crawled
    @invalid_counter=0 ## number of invalid pages checked
    @already_searched=[] ## links not to crawl again
    @search_term=''
    @search_que=[] ## a list of links to crawl
    @matching_pages=[] ## a list of pages that matched the search term
    @matching_links=[] ## links that directly contain the search term
    @base_page='' ## starting page

  end
  
  def start_crawling(base_page,term)
    if @case_sensitivity==false;@search_term=term.to_s.downcase;else;@search_term=term.to_s;end
	@search_que=[base_page]
	unless @crawling ; crawl_loop ; end
  end
  
  def stop_crawling
    @crawling=false
  end
  
  def save_crawl
    dat=[@counter.to_s,@invalid_counter.to_s,@case_sensitivity.to_s,@already_searched.to_s,@search_que.to_s,@matching_pages.to_s,@matching_links.to_s,@search_term.to_s].to_s    
    f=File.open(Dir.getwd+"/crawl.dat","w");f.write(dat);f.close
	puts "Saved this point in the crawl to file."
  end
  def resume_saved_crawl ## to resume a crawl right after calling stop, just call crawl_loop again
    if File.exist?(Dir.getwd+"/crawl.dat")
	  f=File.open(Dir.getwd+"/crawl.dat","r");dat=f.read.to_s;f.close
	  dat=eval(dat)
	  @counter=dat[0].to_i ; @invalid_counter=dat[1].to_i ; if dat[2]=="true";@case_sensitivity=true;else;@case_sensitivity=false;end
	  @already_searched=eval(dat[3].to_s)
	  @search_que=eval(dat[4].to_s)
	  @matching_pages = eval(dat[5].to_s)
	  @matching_links = eval(dat[6].to_s)
	  if @case_sensitivity==false;@search_term = dat[7].to_s.downcase;else;@search_term=dat[7].to_s;end
	  puts "Preparing to resume crawl..."
	  crawl_loop
	else;return []
	end	
  end
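  ## Illustration (not in the original post): save_crawl above turns every field
  ## into a string and writes the whole array out as one Ruby literal, so crawl.dat
  ## holds something like (values made up):
  ##   ["12", "3", "false", "[\"http://a.com/x\"]", "[\"http://a.com/y\"]", "[]", "[]", "gold"]
  ## resume_saved_crawl evals that outer string back into an array, then evals the
  ## list fields (indexes 3-6) back into arrays before restarting crawl_loop.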
  
  def crawl_loop ## main loop; arguments are set by start_crawling(base_page,term), e.g. base_page='http://url.page.com', term='gold prices in germany'
    @crawling=true ## a variable to keep the loop running	  
	while @crawling
	
      #sleep 1 # can be used to prevent annoyance to servers being crawled
      @counter=@counter+1 ## counts pages checked
	  
	  ## set next page to check                                ## STEERING GOES HERE kinda 
      current=@search_que[0];@search_que.delete_at(0)

      ## print info to console
	  puts "Searching: " + current.to_s
      puts @counter.to_s+"/"+@search_que.length.to_s
	  
	  ## get page contents
	  p=get_page(current)
	  
	  ## check page
      if p == '' ;@invalid_counter+=1; puts "Invalid link checked.("+@invalid_counter.to_s+")" ## if invalid skip
	  else ## if valid page check for search term and save page if found
        ## write page address to file	   
		begin;f=File.open(Dir.getwd+"/dat/a_traveled_links.txt","a");f.write(current.to_s+"\n");f.close;rescue;;end
		## extract links from page
		l=get_links(p)
        ## filter out links that do not have an http/https header
        nl=[];l.each do |i| ; begin;if i.to_s[0..3].downcase=="http";nl<<i;end;rescue;;end;end;l=nl
        if l.length>0;puts "Found "+l.length.to_s+" links on this page."  ## print found links to screen

		  ## filter links searched before
          nl=[];l.each {|i| if @already_searched.include?(i)==false;nl<<i;end};l=nl
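The listing stops partway through crawl_loop, before the get_page and get_links helpers it calls are shown. As a rough sketch only (not the original code), here is one way those helpers could look using just the two libraries the script already requires, 'uri' and 'open-uri':

## sketch only: possible get_page / get_links helpers, reopening the class above
class Web_Crawler
  def get_page(url)
    URI.open(url.to_s).read.to_s ## fetch the page body
  rescue StandardError
    '' ## crawl_loop treats an empty string as an invalid link
  end

  def get_links(page)
    URI.extract(page.to_s, ['http','https']) ## pull http/https URLs out of the page text
  rescue StandardError
    []
  end
end

And a minimal way to drive the class, assuming the full version of crawl_loop is loaded. The loop blocks, so it is started in a background thread here so that stop_crawling and save_crawl can be shown as well:

## sketch only: driving the crawler
crawler = Web_Crawler.new
worker  = Thread.new { crawler.start_crawling('http://example.com', 'gold prices in germany') }
sleep 30              ## let it crawl for a while
crawler.stop_crawling ## sets @crawling to false so the while loop exits
worker.join
crawler.save_crawl    ## writes crawl.dat; resume_saved_crawl can pick it up later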
