#!/usr/bin/perl # # Usage: download_wget.pl URL [debug] # # Where URL is a URL to the HEASARC FTP area # in the URL e.g., https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/1050020180/ # # Version 1.0 J. Allen 2019 (LA/TM) # 1.001 Changed 'index' to 'index*' to eliminate all index.html creation (2020/06/26) # Check if any files are downloaded: report if nothing found (possible URL error) # 1.002 Modify te help (LA 22/09/2020) # 1.1 Correct for situation where a single file is requested: worked, but output # from this script then reported (falsely) that it failed (JA 2021/02/18) use strict; use warnings; my $command; my $status; my $VERSION = 'v1.1 (10 Feb 2021)'; # Give the help is the script is invoke with no argument or too many if (($#ARGV < 0) || ($#ARGV > 1)) { usage_message(); } # Given help if "download help" if ($ARGV[0] =~ /help/i) { usage_message(); } # Exit with usage message if given a non-HEASARC URL if ($ARGV[0] !~ /^https:\/\/heasarc\.gsfc\.nasa\.gov/) { print "ERROR: This is not a HEASARC server address\n\n"; usage_message(); } # Assign first command argument to $url my $url = $ARGV[0]; # Hidden debug feature invoked as "download_wget.pl url debug". Exit with help for anything else # Debug is shared with subroutines, so "our" instead of local "my" our $debug = 0; if ($#ARGV > 0) { if ($ARGV[1] =~ /debug/i) { $debug = 1; } else { usage_message(); } } # In debug mode, print the full URL provided if ($debug) { print " $VERSION\n" ; } if ($debug) { print " 1- $url \n" ; } # Find the part of the URL after the protocol. my $xurl = $url; $xurl =~ s,(^https?|ftp)://,,; # Look how many / my @flds = split("/", $xurl); if ($debug) { print " 3- $#flds \n"; } if ($debug) { print " 4- $flds[$#flds] \n"; } my $cut_count = $#flds - 1; if ($debug) { print " 5- cut-dir=$cut_count \n"; } if ($#flds < 3) { print "URL $url will try to download too much of the archive, exiting."; exit; } elsif ($#flds == 3) { print "WARNING! Valid URL, but you may be getting too much data...\n"; print "Please try again with a more specific pattern match request\n"; exit; } # Look for certain wildcards in URL ([(stuff)], ?, and *) # Call subroutine to deal with wildcards else proceed if ($url =~ /\*|\[.+\]|\?/) { wget_wildcard_search($url); } # Add a '/' to the end of the URL to test if this is directory, modify URL if directory unless ($url =~ /\/$/) { my $testurl = $url . '/'; # Silent URL test (--spider tests validity of URL, but does not download) $command = "wget --spider --quiet $testurl"; $status = system($command); if ($status == 0) { # It is a directory: change the URL to include the '/' $url = $testurl; if ($debug) { print " 2- $url \n";} } } # Run the download command (-q/--quiet unless debug is set) $command = "wget -q -nH -r -c -N --retr-symlinks -e robots=off -N -np -R \'index*\' " . "--cut-dirs=$cut_count $url"; if ($debug) { $command =~ s/^wget \-q/wget/; print "$command\n"; } else { print "Downloading $url\n"; } $status = system($command); wget_check_status($status); # Check if there were any files included, or just empty directories if ($status == 0) { print "Download complete.\n"; # Do a recursive search through all directories downloaded my $local_dir = $flds[$#flds]; my @files = `find $local_dir -type f`; if ($#files < 0) { print "Download found no files in $local_dir: were there no files in $url?\n"; } } else { print "Download failed: please check URL\n"; } #====== END MAIN ROUTINE ======== sub wget_check_status { my ($status) = @_; if ($status == -1) { print "$command failed to execute\n"; } elsif ($status & 127) { printf "wget died with signal %d, %s coredump\n", ($status & 127), ($status & 128) ? 'with' : 'without'; } } sub get_files_from_index { my ($indexfile) = @_; my @results = (); if (-e "$indexfile") { open (my $fh, '<', "$indexfile"); while (<$fh>) { unless (/^"); $file = substr($elements[1],$start_index+2,$stop_index-($start_index+2)); } push (@results, $file); } close $fh; } return \@results; } sub wget_wildcard_search { my ($url) = @_; my $status; my $command; my $recurse_limit = 5; # Look up to 5 directories down from URL in search my @files = (); # List where all the files matching the pattern will go my $testurl = $url; $testurl =~ s,^(https?|ftp)://,,; my $headstring = $1; # Split URL into directories and find the first one with a wildcard # $first_wild is the index of this first wildcard # $wild_count is the number of wildcards in the URLs: for multiple wildcards, # all files will be downloaded to the current directory # E.g. https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2019_01/*/auxil/*.mkf.gz # would put all the .mkf.gz files into the current directory # If there is only a single wildcard setting, maintain the directory structure # E.g. https://heasarc.gsfc.nasa.gov/FTP/maxi/data/obs/MJD58000/MJD5854[2-4] # would create MJD58542, MJD58543, and MJD58544 and populate those with all the # matching subdirectories my @fields = split('/',$testurl); my $first_wild = $#fields-1; my $wild_count = 0; for (my $i = $#fields; $i >= 0; $i--) { if ($fields[$i] =~ /\*|\[.+\]|\?/) { $first_wild = $i; $wild_count++; } } if ($first_wild <= 3) { print "Wildcard in URL $url too close to root, not allowed\n"; exit; } my $current_wild = 999; my $previous_wild = 999; for (my $i = 0; $i <= $#fields; $i++) { if ($fields[$i] =~ /\*|\[.+\]|\?/) { $current_wild = $i; } if ($previous_wild == ($current_wild - 1)) { print "URL $url has two consecutive wildcards: not allowed (too much data)"; exit; } else { $previous_wild = $current_wild; } } # Convert unix-style wildcards into Perl regex for pattern matching # $search is the full pattern, @search is the pattern within each level # of the directories. # E.g. https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_09/*/auxil/ni*.att.gz # $search = '.*/auxil/ni*.att.gz' # @search_fields = ('.*','auxil','ni.*.att.gz') # @search_fields allows pattern matching within each directory my $search = ''; my @search_fields; for (my $i = 0; $i <= ($#fields-$first_wild); $i++) { my $temp = $fields[$i + $first_wild]; $temp =~ s/\?/\./; # Convert ? to . for Perl regex $temp =~ s/\*/\.\*/; # Convert * to .* for Perl regex push(@search_fields,$temp); $search .= $temp . '/'; } chop($search); # Complete regex style search string if ($debug) { print " search string - $search \n" ;} # Build a list of all files and directories that match the pattern # First, make the base URL my $tmpurl = $headstring . '://'; for (my $i = 0; $i < $first_wild; $i++) { $tmpurl .= $fields[$i] . '/'; } chop($tmpurl); my @search = ('/'); my @next_search = (); print "Building a list of all files and directories that match the pattern search\n"; my $searching = 1; while ($searching) { my $dir_count = 0; foreach my $dir (@search) { # List all files and directories in $dir in index.html, and # sort these into directories that match pattern and files that match # pattern and ignore all others. Directories that match pattern are # stored in @next_search and will be traversed to look for files in # the next step down the directory structure ($i+1). Continue until # running out of files or hit the recursion limit (limit on how far down the # directory structure to look) my $testurl = $tmpurl . $dir; if ($debug) { print "Searching $testurl\n"; } elsif ((($dir_count % 20) == 0) && ($dir_count != 0) && (scalar(@search) > 20)) { printf "%d of %d directories searched\n", $dir_count, scalar(@search); } my @search_depth = split('/',$dir); my $current_level = $#search_depth; if ($current_level < 0) { $current_level = 0; } $command = 'wget --no-parent --no-host-directories ' . "--quiet --retr-symlinks -e robots=off " . "--no-remove-listing $testurl"; if (-e "index.html") { unlink "index.html"; } $status = system($command); my $list = get_files_from_index("index.html"); unlink "index.html"; $dir_count++; # Check which items are files or directories, check if they match the # search pattern, and add any matching files to @files, and # any matching directories to @next_search to set up a search # (e.g. url is # https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_09/*/auxil/ni*.att.gz # all directories below 2018_09 will match, need to be searched for auxil, # and all files in auxil matching ni*.att.gz listed to @files) my $regex = '.*'; if ($current_level <= $#search_fields) { $regex = $search_fields[$current_level]; } my $next_dir = ''; my @new_files = (); my $next_level = $current_level + 1; if ($next_level <= $#search_fields) { $next_dir = $search_fields[$next_level] . '/'; # If the next directory has no wildcards, we can skip ahead in search # but if it has any special characters, perform a search through the # directory tree if ($next_dir !~ /^\w+\/$/) { $next_dir = ''; } } if (scalar (@$list == 0)) { print "Download failed: check URL\n"; exit; } foreach my $listing (@$list) { if ($listing =~ /\/$/) { # This is a directory: If match, push to next search if ($listing =~ /${regex}/) { push(@next_search, $dir . $listing . $next_dir); } } else { # This is a file: record the full URL if it matches the pattern if ($listing =~ /${regex}/) { push(@new_files, $testurl . $listing); } } } if (scalar(@new_files)) { if ($debug) { if (scalar(@new_files) == 1) { printf "Found %d file\n", scalar(@new_files); } else { printf "Found %d files\n", scalar(@new_files); } } } push(@files,@new_files); } # End loop through current directory: hand @next_search to @search, or end # search if no more directories to search were found if (scalar(@files)) { if (scalar(@files) == 1) { printf "Found %d file to download so far.\n", scalar(@files); } else { printf "Found %d files to download so far.\n", scalar(@files); } } @search = (); if ($#next_search != -1) { @search = @next_search; @next_search = (); printf "Found %d directories\n", scalar(@search); if (scalar(@search) > 100) { print "Please be patient: this may take a while\n"; } } else { $searching = 0; } # Terminate loop when no further matches are found } # End while searching loop print "Found all matching files, now downloading...\n"; my $cnt = 0; if ($#files < 0) { print "No files to download: check url\n"; exit; } foreach my $download (@files) { # Run WGET on all files that match pattern, then exit $command = "wget -q -nH -r -c -N --retr-symlinks -e robots=off -N -np -nd -R \'index\' " . sprintf("--cut-dirs=%d", $first_wild-1) . " $download"; if ($debug) { $command =~ s/^wget \-q/wget/; print "$command\n"; } else { if (($cnt % 20) == 0) { printf "%d of %d files downloaded.\n", $cnt, scalar(@files); } } $cnt++; if ($wild_count == 1) { $command =~ s/ -nd//; } # Retain directories if only one wildcard $status = system($command); wget_check_status($status); unless ($status == 0) { print "Download failed: check that $download is a valid address\n"; } } if (($cnt % 20) != 0) { printf "%d of %d downloaded.\n", $cnt, scalar(@files); } print "Downloads completed.\n"; exit; } # end wild card search sub usage_message { print << "EndOfHelp"; Usage : download_wget url Downloads data to the local computer from the HEASARC archive using wget command and a url. The possible type of downloads are : a) single directory corresponding to a specific observation b) range of directories corresponding to a range of observations c) single file d) type of files a) The command for a single directory is : > download_wget.pl https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/1050020180/ On the local computer the downloaded data are in the directory 1050020180/ maintaining with the archive structure. b) The command for a range of directories is : > download_wget.pl "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/105002018[01]" On the local computer the downloaded data are in the directories 1050020180/ and 1050020181/. The archive structure is maintained within each directory. c) The command for a single file is : > download_wget.pl https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/1050020180/auxil/ni1050020180.att.gz On the local computer the file is downloaded in the directory where the script is invoked. d) The command for multiple files with the same pattern : > download_wget.pl "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/*/auxil/ni*.att.gz" On the local computer the files are downloaded in the directory where the script is invoked. The archive structure is not maintained. Within a given url wildcard are allowed to search specific files that are located in different directories with a well defined pattern. The wildcard completion allowed are * and [ ]. A maximum of two non consegutive wildcards are allowed for url. The url needs to be specified with double quotes if contains * or [ ]. If the transfer is interruped, the user may restart the same command in the same directory and only the remaining files are downloaded. NOTE: The script does not allow to transfer data from an entire mission archive (e.g. https://heasarc.gsfc.nasa.gov/FTP/nicer) nor from two subsequent wild card directories (e.g. "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/*/*/ni*.att.gz"). EndOfHelp exit; } # # #Allowed # > download_wget https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/1050020180/ # > download_wget "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/105002018[01]" # > download_wget "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/*/auxil/ni*.att.gz" # # Not allowed and need a more specific pattern # > download_wget "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/obs/2018_01/*/*/ni*.att.gz" # > download_wget "https://heasarc.gsfc.nasa.gov/FTP/nicer/data/