#!/usr/bin/env ruby # $Id: btl.rb,v 1.5 2005/07/04 19:28:02 madleser Exp $ # Synopsis:: crawls bittorrent sites # License:: Public Domain (suggestions, enhancements and bugfixes welcome though) # # == General notes # The config file defaults to "~/.btlrc" and the cache/database to # "~/.btl_dump". Windows users must expand this manually or set the HOME # environment variable. # # == Usage examples # Show new files (as in "not already in database") from lunar: # btl.rb new http://a.scarywater.net/lunar/ # # Show new files from lunar matching given regexps. Note that only filenames # are probed. It's done after unescaping them and before spaces are substituted # for "%20" again. Completely escaping the link is tedious if it contains a # path so I decided to be lazy: # btl.rb -f Bravo,DVD,"Grenadier[^\[]*08" new http://a.scarywater.net/lunar/ # # If queue_cmd is set (see the sample config), %url% is replaced by the actual # url and the command evaluated. This way, you can automatically enqueue new # files in your bt client: # btl.rb -q Bravo,DVD,"Grenadier[^\[]*08" new http://a.scarywater.net/lunar/ # # Show cached database for a specified site, cmdline switch semantics also apply: # btl.rb old http://a.scarywater.net/lunar/ # # Display new *and* old entries: # btl.rb all http://a.scarywater.net/lunar/ # # Crawl multiple sites: # btl.rb new http://a.scarywater.net/lunar/ http://a.scarywater.net/aone/ [...] # # === Aliases and Groups (if using a configuration file) # See the config section below for details. # # Do stuff. yarrrrr~: # btl.rb new lunar aone # # Passing no arguments is the equivalent of: # btl.rb new # # Groups are expanded (hentai = [ass-hentai, h-bla]): # btl.rb new lunar aone hentai # will become: # btl.rb new lunar aone ass-hentai h-bla # # == Sample configuration file # I like YAML. 
# # define aliases for sites # sites: # lunar: # url: "http://a.scarywater.net/lunar/" # aone: # url: "http://a.scarywater.net/aone/" # some_obscure_site: # url: "http://ajsdjd.org/" # re: # if the site requires special regexp love # link: 'href="([^"]*%pattern%[^"]*\.torrent)"' # bla zomg # # ^^^^^^^^^^^ # desc: 'href="[^"]*%pattern%[^"]*\.torrent"[^>]*\>([^>]*)\' # bla zomg # # ^^^^^^^^ # ass-hentai: # url: "http://aasasda.net/" # h-bla: # url: "http://sdgsgsg.org" # # # group several aliases together # group: # hentai: # - ass-hentai # - h-bla # # # these are what %pattern% (see above) is substituted for. "all" and the # # site specific patterns are concatenated. # patterns: # all: # - "macross" # - "Miteru" # - "Moon" # - "baby" # lunar: # - "Bravo" # # # self-explanatory. mind you that the filename is escaped at this # # point (spaces => "%20"). # queue_patterns: # all: # - !ruby/regexp "/DVD/" # lunar: # - !ruby/regexp "/Bravo/" # # # command to run to enqueue stuff. # queue_cmd: "btqueue.py add '%url%'" # # # this is evaluated after the config file is sourced. # code: > # def print(site, entry) # puts "[#{site}] \e[1m#{entry['desc']}\e[0m: " + "\e[33m#{entry['url']}\e[0m" + "\e[32m#{entry['link']}\e[0m" # end # # # these are evaluated at various points in the program. with this, you # # can (eg.) set up hooks for whatever you can think of. # callback: # before_loop: > # before the great loop of print() and queuing goodness. # # possibilities of usage include setting up database access # # or messaging an application or [...]. # puts "before" # after_loop: > # after it. use this to clean up the mess your before_loop callback left behind. # puts "after" # # == TODO # - XXX # - add switch for not saving to dump # - enable queuing only when new stuff shows up? prevents accidental queueing when searching cache. - OR - keep track of downloads # - add dates etc. -> today's/yesterday's torrents, ... 
$: << File.expand_path("~/mem/dev/testing/")
require 'linkharvester'
require 'yaml'
require 'optparse'

# Default printer for a single entry: writes "[site] desc: <url><link>" to
# stdout. The 'code' section of the config file may redefine it.
#
# site::  the key the entry was listed under (alias or URL)
# entry:: a hash providing the 'desc', 'url' and 'link' keys
#
# NOTE: defined at top level, so it shadows Kernel#print for the whole process.
def print(site, entry)
  puts "[#{site}] #{entry['desc']}: #{entry['url']}#{entry['link']}"
end

# Reads and parses the YAML file at +path+. Returns nil if the file is not
# readable; otherwise returns whatever the parser produced (false for an
# empty document).
#
# The sample config uses "!ruby/regexp" tags and the dump may contain aliases;
# newer Psych versions reject both in plain YAML.load, so unsafe_load is used
# when it exists. Both files are the user's own, and the config is eval'ed
# anyway.
# NOTE(review): never point this at YAML from an untrusted source.
def load_yaml_file(path)
  return nil unless File.readable?(path)
  text = File.read(path) # read in one go so no file handle is left open
  YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(text) : YAML.load(text)
end

# XXX: windows users must expand this manually or set the HOME environment variable
DUMPFILE, CONF = File.expand_path("~/.btl_dump"), File.expand_path("~/.btlrc")
USAGE = "Usage: #$0 [options] [show [[new|old|all] [sites]]]"

# The local variable names below are kept stable on purpose: the config's
# 'code' and 'callback' snippets are eval'ed in this scope and can see them.
conf = load_yaml_file(CONF)
conf ||= {}
sites = conf['sites'] || {}
patterns = conf['patterns'] || {"all" => ["."]}
patterns.default = []
queue_patterns = conf['queue_patterns'] || {"all" => []}
queue_patterns.default = []
queue_cmd = conf['queue_cmd']
group = conf['group'] || {} # maps a group name to a list of site aliases
callback = conf['callback'] || {}
mode = "new"

# NOTE(review): this executes arbitrary Ruby taken from the config file.
eval conf['code'].to_s

ARGV.options do |oparser|
  oparser.banner = USAGE

  oparser.on("--find=pattern1,patternN", "-f pattern1,patternN", Array,
             "Only display entries matching the given regexps. replaces patterns['all'].") do |list|
    patterns = {'all' => list}
    # keep the same "unknown site => no patterns" default the config hash has
    patterns.default = []
  end

  oparser.on("--queue=pattern1,patternN", "-q pattern1,patternN", Array,
             "Automatically queue filenames matchching the given regexps. complements queue_patterns['all'].") do |list|
    queue_patterns['all'] += list.collect { |pat| Regexp.new(pat.to_s) }
  end

  oparser.on("--queuecmd STRING", "-c STRING", String,
             "Command to execute to enqueue stuff. '%url%' is replaced by the actual URL.") do |arg|
    queue_cmd = arg
  end

  oparser.parse!
end

unless ARGV.empty?
  mode = ARGV.shift

  # fetch environment vars: BTL holds additional space-separated targets.
  # (ENV values are strings, so they have to be split before being appended.)
  ARGV.concat(ENV["BTL"].split) if ENV["BTL"]

  # all space-separated strings after the mode argument (see above) are
  # interpreted as names/urls/groups. overrides the config. it's useful if you
  # want to crawl only specific sites.
  unless ARGV.empty?
    # expand groups into their member aliases. a new list is built instead of
    # editing ARGV while iterating over it, which used to skip the element
    # following an expanded group.
    expanded = ARGV.collect { |name| group.key?(name) ? Array(group[name]) : name }.flatten
    ARGV.replace(expanded)

    # remove site aliases not present in ARGV
    sites.delete_if { |k, v| !ARGV.include?(k) }

    # convert http links to the internal format
    # XXX: pointing to local files won't work
    ARGV.each do |arg|
      sites[arg] = {"url" => arg} if arg =~ /\Ahttp/
    end
  end
end

oldlinks = load_yaml_file(DUMPFILE)
# the default array is shared between all missing keys; it must only be read,
# never modified in place.
oldlinks ||= Hash.new([])
links = nil

# XXX: incomplete arglist
crawler = LinkHarvester.new(:sites => sites, :patterns => patterns, :oldlinks => oldlinks)

case mode
when "all"
  links = crawler.crawl
when "old"
  # only show cached entries of the selected sites
  s = sites.keys
  links = oldlinks.reject { |k, v| !s.include?(k) }
else
  # display newest links
  links = crawler.diff
end

# hook documented in the sample config; runs in this scope.
# NOTE(review): like 'code' above, this evaluates Ruby from the config file.
eval callback['before_loop'].to_s

# process/print all entries in links
links.each_pair do |k, v|
  v.each do |e|
    print(k, e)

    # do the queuing dance. matching is done by comparing all patterns against the filename.
    next unless queue_cmd
    wanted = queue_patterns['all'].any? { |p| e['link'] =~ p } ||
             queue_patterns[k].any? { |p| e['link'] =~ p }
    next unless wanted

    # %url% stands for the complete URL, i.e. the same url + link that print()
    # shows. the block form of gsub keeps backslashes in the URL literal.
    # NOTE(review): the URL comes from a crawled page and is pasted into a
    # shell command line; queue_cmd should quote %url% and the input is not
    # sanitised here.
    full_url = "#{e['url']}#{e['link']}"
    system queue_cmd.gsub("%url%") { full_url }
  end
end

# counterpart of before_loop
eval callback['after_loop'].to_s

# write links back to DUMPFILE if the directory is writable (in case you use a
# guest account or something). ignore http-links given on the cmdline.
unless mode == "old"
  if File.writable?(File.dirname(DUMPFILE))
    links.each_pair do |k, v|
      oldlinks[k] = (oldlinks[k].to_a + v).uniq unless k =~ /\Ahttp/
    end
    # serialise first, so that a failure cannot leave a truncated dump behind
    dump = oldlinks.to_yaml
    File.open(DUMPFILE, "w") { |f| f << dump }
  end
end