Modified the get_url module to respect the content-disposition header if the destination is a directory and the server provides it.

See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html, section 19.5.1.
12 years ago · c85655f720
parent 1cc894f54b
commit c85655f720
1 changed files with 56 additions and 18 deletions
--- a/library/network/get_url
+++ b/library/network/get_url
@ -49,15 +49,20 @@ options:
  dest:
    description:
      - absolute path of where to download the file to.
-      - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
+      - If C(dest) is a directory, either the server-provided filename or, if
        none is provided, the base name of the URL on the remote server will be
        used. If C(dest) is a directory, C(force) has no effect.
    required: true
    default: null
  force:
    description:
-      - If C(yes), will download the file every time and replace the
+      - If C(yes) and C(dest) is not a directory, will download the file every
-        file if the contents change. If C(no), the file will only be downloaded if
+        time and replace the file if the contents change. If C(no), the file
-        the destination does not exist. Generally should be C(yes) only for small
+        will only be downloaded if the destination does not exist. Generally
-        local files. Prior to 0.6, this module behaved as if C(yes) was the default.
+        should be C(yes) only for small local files. Prior to 0.6, this module
        behaved as if C(yes) was the default.
        Has no effect if C(dest) is a directory - the file will always be
        downloaded, but replaced only if the contents changed.
    version_added: "0.7"
    required: false
    choices: [ "yes", "no" ]
@ -125,7 +130,7 @@ def url_filename(url):
        return 'index.html'
    return fn
-def url_do_get(module, url, dest, use_proxy):
+def url_do_get(module, url, dest, use_proxy, last_mod_time):
    """
    Get url and return request and info
    Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy):
    request = urllib2.Request(url)
    request.add_header('User-agent', USERAGENT)
-    if os.path.exists(dest) and not module.params['force']:
+    if last_mod_time:
-        t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
+        tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
        tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
        request.add_header('If-Modified-Since', tstamp)
    try:
@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy):
    return r, info
-def url_get(module, url, dest, use_proxy):
+def url_get(module, url, dest, use_proxy, last_mod_time):
    """
-    Download url and store at dest.
+    Download data from the url and store in a temporary file.
-    If dest is a directory, determine filename from url.
+
    Return (tempfile, info about the request)
    """
-    req, info = url_do_get(module, url, dest, use_proxy)
+    req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)
    # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
    if info['status'] == 304:
@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
    req.close()
    return tempname, info
 def extract_filename_from_headers(headers):
    """
    Extracts a filename from the given dict of HTTP headers.

    Looks for the 'content-disposition' key and, when its value is an
    "attachment" disposition, reads the filename parameter from it. The
    disposition type and the parameter name are matched case-insensitively,
    the value may be a quoted string or a bare token, and other parameters
    may appear before or after it.

    Directory components are stripped from the result, and names that
    cannot identify a file inside a directory (empty, '.' or '..') are
    rejected.

    Returns the filename if successful, else None.
    """
    if 'content-disposition' not in headers:
        return None
    cont_disp = headers['content-disposition']
    if not cont_disp:
        return None
    # Only the "attachment" disposition type is honoured; everything after
    # the first ';' is treated as the parameter list.
    disp_match = re.match(r'\s*attachment\s*;(.*)$', cont_disp, re.I | re.S)
    if not disp_match:
        return None
    # A quoted value stops at the first closing quote, so trailing
    # parameters (e.g. '; size="12"') are not swallowed into the name.
    # 'filename*' (extended notation) is deliberately not matched here.
    param_regex = r'(?:^|;)\s*filename\s*=\s*(?:"([^"]*)"|([^\s;"]+))'
    param_match = re.search(param_regex, disp_match.group(1), re.I)
    if not param_match:
        return None
    res = param_match.group(1)
    if res is None:
        # Unquoted (token) form.
        res = param_match.group(2)
    # Try preventing any funny business: the header is server-controlled,
    # so drop any path components it carries.
    res = os.path.basename(res)
    # basename() leaves '..' untouched and yields '' for 'dir/'; neither is
    # usable as a file name, so let the caller fall back to the URL.
    if res in ('', '.', '..'):
        return None
    return res
 # ==============================================================
 # main
@ -247,15 +270,30 @@ def main():
    sha256sum = module.params['sha256sum']
    use_proxy = module.params['use_proxy']
-    if os.path.isdir(dest):
+    dest_is_dir = os.path.isdir(dest)
-        dest = os.path.join(dest, url_filename(url))
+    last_mod_time = None
    if not dest_is_dir and os.path.exists(dest):
        if not force:
        if os.path.exists(dest):
            module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)
        # If the file already exists, prepare the last modified time for the
        # request.
        mtime = os.path.getmtime(dest)
        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
    # download to tmpsrc
-    tmpsrc, info = url_get(module, url, dest, use_proxy)
+    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
    # Now the request has completed, we can finally generate the final
    # destination file name from the info dict.
    if dest_is_dir:
        filename = extract_filename_from_headers(info)
        if not filename:
            # Fall back to extracting the filename from the URL.
            filename = url_filename(url)
        dest = os.path.join(dest, filename)
    md5sum_src   = None
    md5sum_dest  = None