@@ -1299,7 +1299,28 @@ - (NSString *)diff_text2:(NSMutableArray *)diffs;
|
1299 | 1299 | - (NSString *)diff_toDelta:(NSMutableArray *)diffs;
|
1300 | 1300 | {
|
1301 | 1301 | NSMutableString *delta = [NSMutableString string];
|
| 1302 | +UniChar lastEnd = 0; |
1302 | 1303 | for (Diff *aDiff in diffs) {
|
| 1304 | +if (0 == [aDiff.text length]) { |
| 1305 | +continue; |
| 1306 | +} |
| 1307 | + |
| 1308 | +UniChar thisTop = [aDiff.text characterAtIndex:0]; |
| 1309 | +UniChar thisEnd = [aDiff.text characterAtIndex:([aDiff.text length]-1)]; |
| 1310 | + |
| 1311 | +if (CFStringIsSurrogateHighCharacter(thisEnd)) { |
| 1312 | +lastEnd = thisEnd; |
| 1313 | +aDiff.text = [aDiff.text substringToIndex:([aDiff.text length] - 1)]; |
| 1314 | +} |
| 1315 | + |
| 1316 | +if (0 != lastEnd && CFStringIsSurrogateHighCharacter(lastEnd) && CFStringIsSurrogateLowCharacter(thisTop)) { |
| 1317 | +aDiff.text = [NSString stringWithFormat:@"%C%@", lastEnd, aDiff.text]; |
| 1318 | +} |
| 1319 | + |
| 1320 | +if (0 == [aDiff.text length]) { |
| 1321 | +continue; |
| 1322 | +} |
| 1323 | + |
1303 | 1324 | switch (aDiff.operation) {
|
1304 | 1325 | case DIFF_INSERT:
|
1305 | 1326 | [delta appendFormat:@"+%@\t", [[aDiff.text diff_stringByAddingPercentEscapesForEncodeUriCompatibility]
|
@@ -1321,6 +1342,176 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs;
|
1321 | 1342 | return delta;
|
1322 | 1343 | }
|
1323 | 1344 |
|
/**
 * Convert one hexadecimal digit character into its numeric value.
 *
 * @param c A UTF-16 code unit expected to be one of 0-9, A-F or a-f.
 * @return The value 0-15 represented by the digit.
 * @throws NSException named "Invalid percent-encoded string" when c is
 *     not a hexadecimal digit.
 */
- (NSUInteger)diff_digit16:(unichar)c
{
  if (c >= '0' && c <= '9') {
    return (NSUInteger)(c - '0');
  }
  if (c >= 'A' && c <= 'F') {
    return (NSUInteger)(c - 'A') + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return (NSUInteger)(c - 'a') + 10;
  }

  // %C (not %c) is the format specifier for a 16-bit unichar; %c would
  // truncate or garble any offending character outside the 8-bit range.
  [NSException raise:@"Invalid percent-encoded string" format:@"%C is not a hex digit", c];

  // +raise:format: throws, so this line is not expected to execute; the
  // explicit return keeps every path of this non-void method well-defined.
  return NSNotFound;
}
| 1368 | + |
/**
 * Read one percent-escaped byte of the form "%XX" from a string.
 *
 * @param byte On success, receives the decoded byte value (0-255).
 * @param text The percent-encoded string being scanned.
 * @param index The index in text at which the '%' is expected.
 * @return YES if a complete, well-formed "%XX" escape starts at index;
 *     NO if the string is too short, the character at index is not '%',
 *     or either of the two following characters is not a hex digit.
 */
- (BOOL)diff_percentEncodedByte:(NSUInteger *)byte
                       inString:(NSString *)text
                        atIndex:(NSUInteger)index
{
  // an escape needs three code units: '%' plus two hex digits
  if (index + 3 > [text length] || '%' != [text characterAtIndex:index]) {
    return NO;
  }

  @try {
    *byte = ([self diff_digit16:[text characterAtIndex:(index + 1)]] << 4) +
        [self diff_digit16:[text characterAtIndex:(index + 2)]];
  }
  @catch (NSException *exception) {
    // diff_digit16: raises when it is handed a non-hex character
    return NO;
  }

  return YES;
}

/**
 * Decode a percent-encoded UTF-8 string into a string of UTF-16 code units.
 *
 * This is more permissive than `stringByRemovingPercentEncoding` because
 * that fails if the input represents invalid Unicode characters. However,
 * different diff-match-patch libraries may encode surrogate halves as if
 * they were valid Unicode code points. Therefore, instead of failing or
 * corrupting the output, which `stringByRemovingPercentEncoding` does when
 * it inserts "(null)" in these places, we decode them anyway; once the
 * string is reconstructed from the diffs the surrogate halves are re-joined
 * and we end up with valid Unicode again.
 *
 * @param percentEncoded The percent-encoded text to decode.
 * @return The decoded string, or nil if the input contains a truncated or
 *     malformed percent escape or an invalid UTF-8 byte sequence.
 */
- (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded
{
  NSUInteger length = [percentEncoded length];
  if (0 == length) {
    return @"";
  }

  // Every UTF-16 unit written consumes at least one input unit (a four-byte
  // sequence turns twelve input units into two output units), so `length`
  // units is always enough room. The buffer lives on the heap so that large
  // inputs cannot overflow the stack.
  NSMutableData *buffer = [NSMutableData dataWithLength:(length * sizeof(unichar))];
  unichar *decoded = (unichar *)[buffer mutableBytes];
  if (NULL == decoded) {
    return nil;
  }

  NSUInteger input = 0;
  NSUInteger output = 0;

  while (input < length) {
    unichar c = [percentEncoded characterAtIndex:input];

    // not special, so just copy it through
    if ('%' != c) {
      decoded[output++] = c;
      input += 1;
      continue;
    }

    NSUInteger byte1 = 0;
    NSUInteger byte2 = 0;
    NSUInteger byte3 = 0;
    NSUInteger byte4 = 0;

    if (![self diff_percentEncodedByte:&byte1 inString:percentEncoded atIndex:input]) {
      return nil;
    }

    // single-byte UTF-8 first byte has bitmask 0xxx xxxx
    if ((byte1 & 0x80) == 0) {
      decoded[output++] = (unichar)byte1;
      input += 3;
      continue;
    }

    // at least one continuation byte; these have bitmask 10xx xxxx
    if (![self diff_percentEncodedByte:&byte2 inString:percentEncoded atIndex:(input + 3)] ||
        (byte2 & 0xC0) != 0x80) {
      return nil;
    }

    // continuation bytes only contribute six bits each
    // these data bits are found with the bit mask xx11 1111
    byte2 = byte2 & 0x3F;

    // in two-byte sequences the first byte has bitmask 110x xxxx
    if ((byte1 & 0xE0) == 0xC0) {
      // byte1 ___x xxxx << 6
      // byte2 __yy yyyy
      // value x xxxxyy yyyy -> 11 bits
      decoded[output++] = (unichar)(((byte1 & 0x1F) << 6) | byte2);
      input += 6;
      continue;
    }

    // at least two continuation bytes
    if (![self diff_percentEncodedByte:&byte3 inString:percentEncoded atIndex:(input + 6)] ||
        (byte3 & 0xC0) != 0x80) {
      return nil;
    }

    byte3 = byte3 & 0x3F;

    // in three-byte sequences the first byte has bitmask 1110 xxxx
    if ((byte1 & 0xF0) == 0xE0) {
      // byte1 ____ xxxx << 12
      // byte2 __yy yyyy << 6
      // byte3 __zz zzzz
      // value xxxxyy yyyyzz zzzz -> 16 bits
      // (surrogate halves encoded this way are deliberately accepted)
      decoded[output++] = (unichar)(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
      input += 9;
      continue;
    }

    // three continuation bytes
    if (![self diff_percentEncodedByte:&byte4 inString:percentEncoded atIndex:(input + 9)] ||
        (byte4 & 0xC0) != 0x80) {
      return nil;
    }

    byte4 = byte4 & 0x3F;

    // in four-byte sequences the first byte has bitmask 1111 0xxx
    if ((byte1 & 0xF8) == 0xF0) {
      // byte1 ____ _xxx << 18
      // byte2 __yy yyyy << 12
      // byte3 __zz zzzz << 6
      // byte4 __tt tttt
      // value xxxyy yyyyzz zzzztt tttt -> 21 bits
      NSUInteger codePoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
        // split the supplementary code point into a UTF-16 surrogate pair
        codePoint -= 0x010000;
        decoded[output++] = (unichar)(0xD800 | ((codePoint >> 10) & 0x3FF));
        decoded[output++] = (unichar)(0xDC00 | (codePoint & 0x3FF));
        input += 12;
        continue;
      }
    }

    // invalid lead byte or a code point outside U+10000..U+10FFFF
    return nil;
  }

  NSString *result = [NSString stringWithCharacters:decoded length:output];

  // some objective-c versions of the library produced patches with
  // (null) in the place where surrogates were split across diff
  // boundaries. if we leave those in we'll be stuck with a
  // high-surrogate (null) low-surrogate pattern that will break
  // deeper in the library or consuming application. we'll "fix"
  // these by dropping the (null) and re-joining the surrogate halves
  if ([result rangeOfString:@"(null)"].location == NSNotFound) {
    // nothing the pattern below could match, so skip building the regex
    return result;
  }

  NSRegularExpression *replacer = [NSRegularExpression
      regularExpressionWithPattern:@"([\\x{D800}-\\x{DBFF}])\\(null\\)([\\x{DC00}-\\x{DFFF}])"
                           options:0
                             error:nil];

  return [replacer stringByReplacingMatchesInString:result
                                            options:0
                                              range:NSMakeRange(0, [result length])
                                       withTemplate:@"$1$2"];
}
| 1514 | + |
1324 | 1515 | /**
|
1325 | 1516 | * Given the original text1, and an encoded NSString which describes the
|
1326 | 1517 | * operations required to transform text1 into text2, compute the full diff.
|
@@ -1348,7 +1539,7 @@ - (NSMutableArray *)diff_fromDeltaWithText:(NSString *)text1
|
1348 | 1539 | NSString *param = [token substringFromIndex:1];
|
1349 | 1540 | switch ([token characterAtIndex:0]) {
|
1350 | 1541 | case '+':
|
1351 |
| -param = [param diff_stringByReplacingPercentEscapesForEncodeUriCompatibility]; |
| 1542 | +param = [self diff_decodeURIWithText:param]; |
1352 | 1543 | if (param == nil) {
|
1353 | 1544 | if (error != NULL) {
|
1354 | 1545 | errorDetail = [NSDictionary dictionaryWithObjectsAndKeys:
|
|
0 commit comments