From bf197fd7eebcb3579dd659af35822ce88adc66c8 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Thu, 22 May 2014 05:29:47 -0400
Subject: http: extract type/subtype portion of content-type

When we get a content-type from curl, we get the whole
header line, including any parameters, and without any
normalization (like downcasing or whitespace) applied.
If we later try to match it with strcmp() or even
strcasecmp(), we may get false negatives.

This could cause two visible behaviors:

  1. We might fail to recognize a smart-http server by its
     content-type.

  2. We might fail to relay text/plain error messages to
     users (especially if they contain a charset parameter).

This patch teaches the http code to extract and normalize
just the type/subtype portion of the string. This is
technically passing out less information to the callers, who
can no longer see the parameters. But none of the current
callers cares, and a future patch will add back an
easier-to-use method for accessing those parameters.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 http.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'http.c')

diff --git a/http.c b/http.c
index 94e1afdee7..6bfd0934b3 100644
--- a/http.c
+++ b/http.c
@@ -906,6 +906,35 @@ static CURLcode curlinfo_strbuf(CURL *curl, CURLINFO info, struct strbuf *buf)
 	return ret;
 }
 
+/*
+ * Extract a normalized version of the content type, with any
+ * spaces suppressed, all letters lowercased, and no trailing ";"
+ * or parameters.
+ *
+ * Note that we will silently remove even invalid whitespace. For
+ * example, "text / plain" is specifically forbidden by RFC 2616,
+ * but "text/plain" is the only reasonable output, and this keeps
+ * our code simple.
+ *
+ * Example:
+ *   "TEXT/PLAIN; charset=utf-8" -> "text/plain"
+ *   "text / plain" -> "text/plain"
+ */
+static void extract_content_type(struct strbuf *raw, struct strbuf *type)
+{
+	const char *p;
+
+	strbuf_reset(type);
+	strbuf_grow(type, raw->len);
+	for (p = raw->buf; *p; p++) {
+		if (isspace(*p))
+			continue;
+		if (*p == ';')
+			break;
+		strbuf_addch(type, tolower(*p));
+	}
+}
+
 /* http_request() targets */
 #define HTTP_REQUEST_STRBUF	0
 #define HTTP_REQUEST_FILE	1
@@ -957,9 +986,12 @@ static int http_request(const char *url,
 
 	ret = run_one_slot(slot, &results);
 
-	if (options && options->content_type)
-		curlinfo_strbuf(slot->curl, CURLINFO_CONTENT_TYPE,
-				options->content_type);
+	if (options && options->content_type) {
+		struct strbuf raw = STRBUF_INIT;
+		curlinfo_strbuf(slot->curl, CURLINFO_CONTENT_TYPE, &raw);
+		extract_content_type(&raw, options->content_type);
+		strbuf_release(&raw);
+	}
 
 	if (options && options->effective_url)
 		curlinfo_strbuf(slot->curl, CURLINFO_EFFECTIVE_URL,
-- 
cgit v1.3


From e31316263af98c4583be39b469f3152a23eba91d Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Thu, 22 May 2014 05:30:05 -0400
Subject: http: optionally extract charset parameter from content-type

Since the previous commit, we now give a sanitized,
shortened version of the content-type header to any callers
who ask for it.

This patch adds back a way for them to cleanly access
specific parameters to the type. We could easily extract all
parameters and make them available via a string_list, but:

  1. That complicates the interface and memory management.

  2. In practice, no planned callers care about anything
     except the charset.

This patch therefore goes with the simplest thing, and we
can expand or change the interface later if it becomes
necessary.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 http.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 http.h |  7 +++++++
 2 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'http.c')

diff --git a/http.c b/http.c
index 6bfd0934b3..84463dff3d 100644
--- a/http.c
+++ b/http.c
@@ -906,6 +906,32 @@ static CURLcode curlinfo_strbuf(CURL *curl, CURLINFO info, struct strbuf *buf)
 	return ret;
 }
 
+/*
+ * Check for and extract a content-type parameter. "raw"
+ * should be positioned at the start of the potential
+ * parameter, with any whitespace already removed.
+ *
+ * "name" is the name of the parameter. The value is appended
+ * to "out".
+ */
+static int extract_param(const char *raw, const char *name,
+			 struct strbuf *out)
+{
+	size_t len = strlen(name);
+
+	if (strncasecmp(raw, name, len))
+		return -1;
+	raw += len;
+
+	if (*raw != '=')
+		return -1;
+	raw++;
+
+	while (*raw && !isspace(*raw))
+		strbuf_addch(out, *raw++);
+	return 0;
+}
+
 /*
  * Extract a normalized version of the content type, with any
  * spaces suppressed, all letters lowercased, and no trailing ";"
@@ -916,11 +942,15 @@ static CURLcode curlinfo_strbuf(CURL *curl, CURLINFO info, struct strbuf *buf)
  * but "text/plain" is the only reasonable output, and this keeps
  * our code simple.
  *
+ * If the "charset" argument is not NULL, store the value of any
+ * charset parameter there.
+ *
  * Example:
- *   "TEXT/PLAIN; charset=utf-8" -> "text/plain"
+ *   "TEXT/PLAIN; charset=utf-8" -> "text/plain", "utf-8"
  *   "text / plain" -> "text/plain"
  */
-static void extract_content_type(struct strbuf *raw, struct strbuf *type)
+static void extract_content_type(struct strbuf *raw, struct strbuf *type,
+				 struct strbuf *charset)
 {
 	const char *p;
 
@@ -929,10 +959,25 @@ static void extract_content_type(struct strbuf *raw, struct strbuf *type)
 	for (p = raw->buf; *p; p++) {
 		if (isspace(*p))
 			continue;
-		if (*p == ';')
+		if (*p == ';') {
+			p++;
 			break;
+		}
 		strbuf_addch(type, tolower(*p));
 	}
+
+	if (!charset)
+		return;
+
+	strbuf_reset(charset);
+	while (*p) {
+		while (isspace(*p))
+			p++;
+		if (!extract_param(p, "charset", charset))
+			return;
+		while (*p && !isspace(*p))
+			p++;
+	}
 }
 
 /* http_request() targets */
@@ -989,7 +1034,8 @@ static int http_request(const char *url,
 	if (options && options->content_type) {
 		struct strbuf raw = STRBUF_INIT;
 		curlinfo_strbuf(slot->curl, CURLINFO_CONTENT_TYPE, &raw);
-		extract_content_type(&raw, options->content_type);
+		extract_content_type(&raw, options->content_type,
+				     options->charset);
 		strbuf_release(&raw);
 	}
 
diff --git a/http.h b/http.h
index e64084fe6d..473179b14d 100644
--- a/http.h
+++ b/http.h
@@ -143,6 +143,13 @@ struct http_get_options {
 	/* If non-NULL, returns the content-type of the response. */
 	struct strbuf *content_type;
 
+	/*
+	 * If non-NULL, and content_type above is non-NULL, returns
+	 * the charset parameter from the content-type. If none is
+	 * present, returns an empty string.
+	 */
+	struct strbuf *charset;
+
 	/*
 	 * If non-NULL, returns the URL we ended up at, including any
 	 * redirects we followed.
-- 
cgit v1.3


From c553fd1c1ef76688b6200e66a825b530b0b02140 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Thu, 22 May 2014 05:36:12 -0400
Subject: http: default text charset to iso-8859-1

This is specified by RFC 2616 as the default if no "charset"
parameter is given.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 http.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'http.c')

diff --git a/http.c b/http.c
index 84463dff3d..2b4f6a357c 100644
--- a/http.c
+++ b/http.c
@@ -978,6 +978,9 @@ static void extract_content_type(struct strbuf *raw, struct strbuf *type,
 		while (*p && !isspace(*p))
 			p++;
 	}
+
+	if (!charset->len && starts_with(type->buf, "text/"))
+		strbuf_addstr(charset, "ISO-8859-1");
 }
 
 /* http_request() targets */
-- 
cgit v1.3