diff --git a/cache/binary/tags.go b/cache/binary/tags.go index dbf86f0..50d79af 100644 --- a/cache/binary/tags.go +++ b/cache/binary/tags.go @@ -15,8 +15,9 @@ package binary // etc.) are converted to a single ASCII control char (0x01-0x1f) import ( - "github.com/omniscale/imposm3/element" "unicode/utf8" + + "github.com/omniscale/imposm3/element" ) type codepoint rune @@ -38,6 +39,8 @@ const maxCodePoint = codepoint('\uF8FF') var nextCodePoint = codepoint('\uE000') +const escapeRune = '\ufffd' // unicode replacement char + func addTagCodePoint(key, value string) { if nextCodePoint > maxCodePoint { panic("all codepoints used!") @@ -71,20 +74,33 @@ func tagsFromArray(arr []string) element.Tags { } result := make(element.Tags) for i := 0; i < len(arr); i += 1 { - if r, size := utf8.DecodeRuneInString(arr[i]); size >= 3 && - codepoint(r) >= minCodePoint && - codepoint(r) < nextCodePoint { - tag, ok := codePointToTag[codepoint(r)] - if !ok { - panic("missing tag for codepoint") + if r, size := utf8.DecodeRuneInString(arr[i]); size >= 3 { + if r == escapeRune { + // remove escape rune + result[arr[i][size:]] = arr[i+1] + i++ + continue + } else if codepoint(r) >= minCodePoint && + codepoint(r) < nextCodePoint { + tag, ok := codePointToTag[codepoint(r)] + if !ok { + panic("missing tag for codepoint") + } + result[tag.Key] = tag.Value + continue } - result[tag.Key] = tag.Value } else if len(arr[i]) > 0 && arr[i][0] < 32 { result[codePointToCommonKey[arr[i][0]]] = arr[i][1:] - } else { - result[arr[i]] = arr[i+1] - i++ + continue } + if len(arr) <= i+1 { + // notify users affected by #112 + // TODO remove check in the future to avoid misleading message + // if a similar issue shows up + panic("Internal cache corrupt, see: https://github.com/omniscale/imposm3/issues/122") + } + result[arr[i]] = arr[i+1] + i++ } return result } @@ -95,21 +111,32 @@ func tagsAsArray(tags element.Tags) []string { } result := make([]string, 0, 2*len(tags)) for key, val := range tags { - if valMap, ok := tagsToCodePoint[key]; ok { - if codePoint, ok := valMap[val]; ok { - result = append(result, string(codePoint)) - continue - } - } - if codepoint, ok := commonKeys[key]; ok { - result = append(result, string(codepoint)+val) - continue - } - result = append(result, key, val) + result = appendTag(result, key, val) } return result } +func appendTag(arr []string, key, val string) []string { + if valMap, ok := tagsToCodePoint[key]; ok { + if codePoint, ok := valMap[val]; ok { + return append(arr, string(codePoint)) + } + } + if codepoint, ok := commonKeys[key]; ok { + return append(arr, string(codepoint)+val) + } + // escape first char/rune if it is a commonKey/tagCodePoint + if key[0] < 32 { + key = string(escapeRune) + key + } else if r, size := utf8.DecodeRuneInString(key); size >= 3 && + (codepoint(r) >= minCodePoint && + codepoint(r) <= maxCodePoint) || + (r == escapeRune) { + key = string(escapeRune) + key + } + return append(arr, key, val) +} + func init() { // // DO NOT EDIT, REMOVE, REORDER ANY OF THE FOLLOWING LINES! diff --git a/cache/binary/tags_test.go b/cache/binary/tags_test.go index c0a18f5..95e455e 100644 --- a/cache/binary/tags_test.go +++ b/cache/binary/tags_test.go @@ -1,9 +1,11 @@ package binary import ( - "github.com/omniscale/imposm3/element" + "reflect" "sort" "testing" + + "github.com/omniscale/imposm3/element" ) func TestTagsAsAndFromArray(t *testing.T) { @@ -34,6 +36,73 @@ func TestTagsAsAndFromArray(t *testing.T) { } } +func TestTagsArrayEncoding(t *testing.T) { + for i, check := range []struct { + // Test tags with string array of key/value instead of + // element.Tags to have fixed order + tags []string + expected []string + }{ + { + []string{"name", "foo", "highway", "residential", "oneway", "yes"}, + []string{ + "\x01foo", + "\ue001", + "\ue008", + }, + }, + { // ascii control characters are escaped + []string{"name", "\tfoo", "\tfoo", "bar"}, + []string{ + "\x01\tfoo", + "\ufffd\tfoo", "bar", + }, + }, + { // private unicode characters are escaped + []string{"\ue008foo", "bar", "\ue008", "baz"}, + []string{ + "\ufffd\ue008foo", "bar", + "\ufffd\ue008", "baz", + }, + }, + { // replacement characters (our escape token) are escaped as well + []string{"\ufffd\ue008foo", "bar", "\ufffd\ufffd\ufffd\ue008foo", "bar"}, + []string{ + "\ufffd\ufffd\ue008foo", "bar", + "\ufffd\ufffd\ufffd\ufffd\ue008foo", "bar", + }, + }, + } { + var actual []string + for i := 0; i < len(check.tags); i += 2 { + actual = appendTag(actual, check.tags[i], check.tags[i+1]) + } + if len(check.expected) != len(actual) { + t.Errorf("case %d: unexpected tag array %#v != %#v", i, actual, check.expected) + continue + } + actualTags := tagsFromArray(actual) + expectedTags := make(element.Tags) + for i := 0; i < len(check.tags); i += 2 { + expectedTags[check.tags[i]] = check.tags[i+1] + } + if !reflect.DeepEqual(actualTags, expectedTags) { + t.Errorf("case %d: unexpected tags %#v != %#v", i, actualTags, expectedTags) + } + } +} + +func TestTagsArrayIssue122Panic(t *testing.T) { + tagsFromArray([]string{"foo", "bar"}) + defer func() { + p := recover() + if p == nil { + t.Fatal("did not panic") + } + }() + tagsFromArray([]string{"foo"}) +} + func TestCodePoints(t *testing.T) { // codepoints should never change, so check a few for sanity if c := tagsToCodePoint["building"]["yes"]; c != codepoint('\ue000') {