build/download/http_downloader.py

   1 #! /usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 #   COPYRIGHT NOTICE STARTS HERE
   5
   6 #   Copyright 2019 © Samsung Electronics Co., Ltd.
   7 #
   8 #   Licensed under the Apache License, Version 2.0 (the "License");
   9 #   you may not use this file except in compliance with the License.
  10 #   You may obtain a copy of the License at
  11 #
  12 #       http://www.apache.org/licenses/LICENSE-2.0
  13 #
  14 #   Unless required by applicable law or agreed to in writing, software
  15 #   distributed under the License is distributed on an "AS IS" BASIS,
  16 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17 #   See the License for the specific language governing permissions and
  18 #   limitations under the License.
  19
  20 #   COPYRIGHT NOTICE ENDS HERE
  21
  22 import argparse
  23 import datetime
  24 import logging
  25 import os
  26 import sys
  27 import timeit
  28
  29 import requests
  30 from retrying import retry
  31
  32 import http_file
  33 from concurrent_downloader import ConcurrentDownloader
  34
# Module-level logger named after this module; it is configured in run_cli()
log = logging.getLogger(__name__)
  36
  37
  38 class HttpDownloader(ConcurrentDownloader):
  39     def __init__(self, *list_args, workers=None):
  40         super().__init__('http files', *list_args, workers=workers)
  41
  42     @property
  43     def check_table(self):
  44         """
  45         Table with information what items from lists are downloaded
  46         """
  47         self.missing()
  48         header = ['Name', 'Downloaded']
  49         return self._check_table(header, {'Name': 'l'},
  50                                  ((item, item not in self._missing) for item
  51                                   in self._data_list))
  52
  53     @staticmethod
  54     def _make_get_request(url):
  55         """
  56         Run http get request
  57         :param url: url to reqeuest
  58         :return: requests.Response
  59         """
  60         req = requests.get(url)
  61         req.raise_for_status()
  62         return req
  63
  64     def _is_missing(self, item):
  65         """
  66         Check if item is missing (not downloaded)
  67         :param item: item to check
  68         :return: boolean
  69         """
  70         return not os.path.isfile(
  71             '{}/{}'.format(self._data_list[item], item.rsplit('//')[-1]))
  72
  73     @retry(stop_max_attempt_number=5, wait_fixed=2000)
  74     def _get_file(self, file_uri):
  75         """
  76         Get http file from uri
  77         :param file_uri: uri of the file
  78         :return: file content
  79         """
  80         if not file_uri.startswith('http'):
  81             file_uri = 'http://' + file_uri
  82         file_req = self._make_get_request(file_uri)
  83         return file_req.content
  84
  85     def _download_item(self, item):
  86         """
  87         Download http file
  88         :param item: http file to be downloaded (tuple: (uri, dst_dir))
  89         """
  90         log.info('Downloading: {}'.format(item[0]))
  91         dst_path = '{}/{}'.format(item[1], item[0].rsplit('//')[-1])
  92         try:
  93             f = http_file.HttpFile(item[0], self._get_file(item[0]), dst_path)
  94             f.save_to_file()
  95         except Exception as err:
  96             log.exception('Error downloading: {}: {}'.format(item[0], err))
  97             if os.path.isfile(dst_path):
  98                 os.remove(dst_path)
  99             raise err
 100         log.info('Downloaded: {}'.format(f.name))
 101
 102
 103 def run_cli():
 104     """
 105     Run as cli tool
 106     """
 107     parser = argparse.ArgumentParser(description='Download http files from list')
 108     parser.add_argument('file_list', metavar='file-list',
 109                         help='File with list of http files to download')
 110     parser.add_argument('--output-dir', '-o', default=os.getcwd(),
 111                         help='Destination directory for saving')
 112     parser.add_argument('--check', '-c', action='store_true', default=False,
 113                         help='Check mode')
 114     parser.add_argument('--debug', action='store_true', default=False,
 115                         help='Turn on debug output')
 116     parser.add_argument('--workers', type=int, default=None,
 117                         help='Set maximum workers for parallel download (default: cores * 5)')
 118
 119     args = parser.parse_args()
 120
 121     if args.debug:
 122         logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 123     else:
 124         logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')
 125
 126     downloader = HttpDownloader([args.file_list, args.output_dir], workers=args.workers)
 127
 128     if args.check:
 129         log.info('Check mode. No download will be executed.')
 130         log.info(downloader.check_table)
 131         sys.exit(0)
 132
 133     timer_start = timeit.default_timer()
 134     try:
 135         downloader.download()
 136     except RuntimeError:
 137         sys.exit(1)
 138     finally:
 139         log.info('Downloading finished in {}'.format(
 140             datetime.timedelta(seconds=timeit.default_timer() - timer_start)))
 141
 142
# Run the command line interface only when executed as a script
if __name__ == '__main__':
    run_cli()