From 8036cb0cab54feb0314a3e69e098b52fddee5296 Mon Sep 17 00:00:00 2001
From: Tin Tvrtkovic
Date: Sat, 2 Nov 2013 17:23:59 +0100
Subject: [PATCH 1/2] Modified the get_url module to respect the
 content-disposition header if the destination is a directory and the server
 provides it. See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html,
 section 19.5.1.

---
 network/get_url | 74 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 18 deletions(-)

diff --git a/network/get_url b/network/get_url
index 35d724febed..58b1eb16aad 100644
--- a/network/get_url
+++ b/network/get_url
@@ -49,15 +49,20 @@ options:
   dest:
     description:
       - absolute path of where to download the file to.
-      - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
+      - If C(dest) is a directory, either the server provided filename or, if
+        none provided, the base name of the URL on the remote server will be
+        used. If a directory, C(force) has no effect.
     required: true
     default: null
   force:
     description:
-      - If C(yes), will download the file every time and replace the
-        file if the contents change. If C(no), the file will only be downloaded if
-        the destination does not exist. Generally should be C(yes) only for small
-        local files. Prior to 0.6, this module behaved as if C(yes) was the default.
+      - If C(yes) and C(dest) is not a directory, will download the file every
+        time and replace the file if the contents change. If C(no), the file
+        will only be downloaded if the destination does not exist. Generally
+        should be C(yes) only for small local files. Prior to 0.6, this module
+        behaved as if C(yes) was the default.
+        Has no effect if C(dest) is a directory - the file will always be
+        downloaded, but replaced only if the contents changed.
     version_added: "0.7"
     required: false
     choices: [ "yes", "no" ]
@@ -125,7 +130,7 @@ def url_filename(url):
         return 'index.html'
     return fn

-def url_do_get(module, url, dest, use_proxy):
+def url_do_get(module, url, dest, use_proxy, last_mod_time):
     """
     Get url and return request and info
     Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy):
     request = urllib2.Request(url)
     request.add_header('User-agent', USERAGENT)

-    if os.path.exists(dest) and not module.params['force']:
-        t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
-        tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
+    if last_mod_time:
+        tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
         request.add_header('If-Modified-Since', tstamp)

     try:
@@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy, last_mod_time):

     return r, info

-def url_get(module, url, dest, use_proxy):
+def url_get(module, url, dest, use_proxy, last_mod_time):
     """
-    Download url and store at dest.
-    If dest is a directory, determine filename from url.
+    Download data from the url and store in a temporary file.
+
     Return (tempfile, info about the request)
     """

-    req, info = url_do_get(module, url, dest, use_proxy)
+    req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)

     # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
     if info['status'] == 304:
@@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy, last_mod_time):
     req.close()
     return tempname, info

+def extract_filename_from_headers(headers):
+    """
+    Extracts a filename from the given dict of HTTP headers.
+
+    Looks for the content-disposition header and applies a regex.
+    Returns the filename if successful, else None."""
+    cont_disp_regex = 'attachment; ?filename="(.+)"'
+    res = None
+
+    if 'content-disposition' in headers:
+        cont_disp = headers['content-disposition']
+        match = re.match(cont_disp_regex, cont_disp)
+        if match:
+            res = match.group(1)
+            # Try preventing any funny business.
+            res = os.path.basename(res)
+
+    return res
+
 # ==============================================================
 # main

@@ -247,15 +270,30 @@ def main():
     sha256sum = module.params['sha256sum']
     use_proxy = module.params['use_proxy']

-    if os.path.isdir(dest):
-        dest = os.path.join(dest, url_filename(url))
+    dest_is_dir = os.path.isdir(dest)
+    last_mod_time = None

-    if not force:
-        if os.path.exists(dest):
+    if not dest_is_dir and os.path.exists(dest):
+        if not force:
             module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)

+        # If the file already exists, prepare the last modified time for the
+        # request.
+        mtime = os.path.getmtime(dest)
+        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
+
     # download to tmpsrc
-    tmpsrc, info = url_get(module, url, dest, use_proxy)
+    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
+
+    # Now the request has completed, we can finally generate the final
+    # destination file name from the info dict.
+    if dest_is_dir:
+        filename = extract_filename_from_headers(info)
+        if not filename:
+            # Fall back to extracting the filename from the URL.
+            filename = url_filename(url)
+        dest = os.path.join(dest, filename)
+
     md5sum_src = None
     md5sum_dest = None


From b1fa35ac3d854ef2bd22e73557ede38cf0580985 Mon Sep 17 00:00:00 2001
From: Tin Tvrtkovic
Date: Sat, 9 Nov 2013 00:35:14 +0100
Subject: [PATCH 2/2] Use the final URL from the finished request instead of
 the provided URL for filename generation, to properly deal with redirects.

---
 network/get_url | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/network/get_url b/network/get_url
index 58b1eb16aad..19169b8fb19 100644
--- a/network/get_url
+++ b/network/get_url
@@ -183,11 +183,11 @@ def url_do_get(module, url, dest, use_proxy, last_mod_time):
     try:
         r = urllib2.urlopen(request)
         info.update(r.info())
+        info['url'] = r.geturl() # The URL goes in too, because of redirects.
         info.update(dict(msg="OK (%s bytes)" % r.headers.get('Content-Length', 'unknown'), status=200))
     except urllib2.HTTPError, e:
         # Must not fail_json() here so caller can handle HTTP 304 unmodified
         info.update(dict(msg=str(e), status=e.code))
-        return r, info
     except urllib2.URLError, e:
         code = getattr(e, 'code', -1)
         module.fail_json(msg="Request failed: %s" % str(e), status_code=code)
@@ -287,11 +287,14 @@ def main():

     # Now the request has completed, we can finally generate the final
     # destination file name from the info dict.
+
     if dest_is_dir:
         filename = extract_filename_from_headers(info)
         if not filename:
             # Fall back to extracting the filename from the URL.
-            filename = url_filename(url)
+            # Pluck the URL from the info, since a redirect could have changed
+            # it.
+            filename = url_filename(info['url'])
         dest = os.path.join(dest, filename)

     md5sum_src = None
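
As a standalone illustration of the behavior these two patches introduce, the sketch below reuses the content-disposition regex and the basename() guard from the new extract_filename_from_headers() helper. It is not part of the patches: the helper name and the sample header values are invented for the example, and real get_url requests go through url_do_get()/url_get() rather than this trimmed copy.

# Standalone sketch, not from the patches: mirrors the content-disposition
# handling added above, with invented sample header values.
import os
import re

def filename_from_content_disposition(headers):
    # Same pattern as the patch: only quoted attachment filenames are matched.
    match = re.match('attachment; ?filename="(.+)"',
                     headers.get('content-disposition', ''))
    if not match:
        return None
    # basename() strips any path components smuggled into the header value.
    return os.path.basename(match.group(1))

print(filename_from_content_disposition(
    {'content-disposition': 'attachment; filename="report.csv"'}))        # report.csv
print(filename_from_content_disposition(
    {'content-disposition': 'attachment; filename="../../etc/passwd"'}))  # passwd
print(filename_from_content_disposition({}))                              # None

The basename() call is what keeps a header such as filename="../../etc/passwd" from steering os.path.join(dest, filename) outside the destination directory; when no usable header is present, main() still falls back to url_filename(), which after the second patch is fed the post-redirect URL.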