From 245e1c196dab226675a02a8caca5a83373f5e4d4 Mon Sep 17 00:00:00 2001 From: Carlos Martín Nieto Date: Thu, 16 Apr 2015 16:05:12 +0200 Subject: dir: allow a BOM at the beginning of exclude files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some text editors like Notepad or LibreOffice write a UTF-8 BOM in order to indicate that the file is Unicode text rather than whatever the current locale would indicate. If someone uses such an editor to edit a gitignore file, we are left with those three bytes at the beginning of the file. If we do not skip them, we will attempt to match a filename with the BOM as a prefix, which won't match the files the user is expecting. Signed-off-by: Carlos Martín Nieto Signed-off-by: Junio C Hamano --- dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 3f7a0256b6..10c1f903ef 100644 --- a/dir.c +++ b/dir.c @@ -538,6 +538,7 @@ int add_excludes_from_file_to_list(const char *fname, struct stat st; int fd, i, lineno = 1; size_t size = 0; + static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; char *buf, *entry; fd = open(fname, O_RDONLY); @@ -574,7 +575,12 @@ int add_excludes_from_file_to_list(const char *fname, } el->filebuf = buf; - entry = buf; + + if (size >= 3 && !memcmp(buf, utf8_bom, 3)) + entry = buf + 3; + else + entry = buf; + for (i = 0; i < size; i++) { if (buf[i] == '\n') { if (entry != buf + i && entry[0] != '#') { -- cgit v1.3 From cb0abea87017559e1db3721a7e6d89a336d845e9 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 11:26:29 -0700 Subject: add_excludes_from_file: clarify the bom skipping logic Even though the previous step shifts where the "entry" begins, we still iterate over the original buf[], which may begin with the UTF-8 BOM we are supposed to be skipping. 
At the end of the first line, the code grabs the contents of it starting at "entry", so there is nothing wrong per se, but the logic looks really confused. Instead, move the buf pointer and shrink its size, to truly pretend that UTF-8 BOM did not exist in the input. Signed-off-by: Junio C Hamano --- dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 10c1f903ef..b5bb38977b 100644 --- a/dir.c +++ b/dir.c @@ -576,10 +576,11 @@ int add_excludes_from_file_to_list(const char *fname, el->filebuf = buf; - if (size >= 3 && !memcmp(buf, utf8_bom, 3)) - entry = buf + 3; - else - entry = buf; + if (size >= 3 && !memcmp(buf, utf8_bom, 3)) { + buf += 3; + size -= 3; + } + entry = buf; for (i = 0; i < size; i++) { if (buf[i] == '\n') { -- cgit v1.3 From dde843e7378f65004415bd108038659de9ce2abd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 10:45:29 -0700 Subject: utf8-bom: introduce skip_utf8_bom() helper With the recent change to ignore the UTF8 BOM at the beginning of .gitignore files, we now have two codepaths that do such a skipping (the other one is for reading the configuration files). Introduce utf8_bom[] constant string and skip_utf8_bom() helper and teach .gitignore code how to use it. 
Signed-off-by: Junio C Hamano --- dir.c | 9 ++++----- utf8.c | 11 +++++++++++ utf8.h | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index b5bb38977b..4c4bf910fa 100644 --- a/dir.c +++ b/dir.c @@ -12,6 +12,7 @@ #include "refs.h" #include "wildmatch.h" #include "pathspec.h" +#include "utf8.h" struct path_simplify { int len; @@ -538,7 +539,6 @@ int add_excludes_from_file_to_list(const char *fname, struct stat st; int fd, i, lineno = 1; size_t size = 0; - static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; char *buf, *entry; fd = open(fname, O_RDONLY); @@ -576,10 +576,9 @@ int add_excludes_from_file_to_list(const char *fname, el->filebuf = buf; - if (size >= 3 && !memcmp(buf, utf8_bom, 3)) { - buf += 3; - size -= 3; - } + if (skip_utf8_bom(&buf, size)) + size -= buf - el->filebuf; + entry = buf; for (i = 0; i < size; i++) { diff --git a/utf8.c b/utf8.c index 520fbb4994..28e6d76a42 100644 --- a/utf8.c +++ b/utf8.c @@ -633,3 +633,14 @@ int is_hfs_dotgit(const char *path) return 1; } + +const char utf8_bom[] = "\357\273\277"; + +int skip_utf8_bom(char **text, size_t len) +{ + if (len < strlen(utf8_bom) || + memcmp(*text, utf8_bom, strlen(utf8_bom))) + return 0; + *text += strlen(utf8_bom); + return 1; +} diff --git a/utf8.h b/utf8.h index e4d9183c5f..e7b2aa4168 100644 --- a/utf8.h +++ b/utf8.h @@ -13,6 +13,9 @@ int same_encoding(const char *, const char *); __attribute__((format (printf, 2, 3))) int utf8_fprintf(FILE *, const char *, ...); +extern const char utf8_bom[]; +extern int skip_utf8_bom(char **, size_t); + void strbuf_add_wrapped_text(struct strbuf *buf, const char *text, int indent, int indent2, int width); void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len, -- cgit v1.3