Skip to content

Commit d7a55a5

Browse files
committed
Move width_in_str lookup code to lookup.rs
1 parent 7409dde commit d7a55a5

4 files changed

Lines changed: 297 additions & 747 deletions

File tree

scripts/unicode.py

Lines changed: 1 addition & 278 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def emit_rust_file(path: str, generator: Callable[[IO[str]], None]):
108108
f.write(FILE_HEADER)
109109
generator(f)
110110

111+
111112
Codepoint = int
112113
BitPos = int
113114

@@ -1311,281 +1312,6 @@ def lookup_fns(
13111312
None
13121313
}}
13131314
}}
1314-
1315-
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
1316-
/// Ambiguous width characters are treated as {ambig}.
1317-
{cfg}#[inline]
1318-
pub(crate) fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{
1319-
if next_info.is_emoji_presentation() {{
1320-
if starts_emoji_presentation_seq(c) {{
1321-
let width = if next_info.is_zwj_emoji_presentation() {{
1322-
0
1323-
}} else {{
1324-
2
1325-
}};
1326-
return (width, WidthInfo::EMOJI_PRESENTATION);
1327-
}} else {{
1328-
next_info = next_info.unset_emoji_presentation();
1329-
}}
1330-
}}"""
1331-
1332-
if is_cjk:
1333-
s += """
1334-
if (matches!(
1335-
next_info,
1336-
WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY | WidthInfo::SOLIDUS_OVERLAY_ALEF
1337-
) && matches!(c, '<' | '=' | '>'))
1338-
{
1339-
return (2, WidthInfo::DEFAULT);
1340-
}"""
1341-
1342-
s += """
1343-
if c <= '\\u{A0}' {
1344-
match c {
1345-
'\\n' => (1, WidthInfo::LINE_FEED),
1346-
'\\r' if next_info == WidthInfo::LINE_FEED => (0, WidthInfo::DEFAULT),
1347-
_ => (1, WidthInfo::DEFAULT),
1348-
}
1349-
} else {
1350-
// Fast path
1351-
if next_info != WidthInfo::DEFAULT {
1352-
if c == '\\u{FE0F}' {
1353-
return (0, next_info.set_emoji_presentation());
1354-
}"""
1355-
1356-
if is_cjk:
1357-
s += """
1358-
if matches!(c, '\\u{FE00}' | '\\u{FE02}') {
1359-
return (0, next_info.set_vs1_2_3());
1360-
}
1361-
"""
1362-
else:
1363-
s += """
1364-
if c == '\\u{FE01}' {
1365-
return (0, next_info.set_vs1_2_3());
1366-
}
1367-
if c == '\\u{FE0E}' {
1368-
return (0, next_info.set_text_presentation());
1369-
}
1370-
if next_info.is_text_presentation() {
1371-
if starts_non_ideographic_text_presentation_seq(c) {
1372-
return (1, WidthInfo::DEFAULT);
1373-
} else {
1374-
next_info = next_info.unset_text_presentation();
1375-
}
1376-
} else """
1377-
1378-
s += """if next_info.is_vs1_2_3() {
1379-
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
1380-
return ("""
1381-
1382-
s += str(2 - is_cjk)
1383-
1384-
s += """, WidthInfo::DEFAULT);
1385-
} else {
1386-
next_info = next_info.unset_vs1_2_3();
1387-
}
1388-
}
1389-
if next_info.is_ligature_transparent() {
1390-
if c == '\\u{200D}' {
1391-
return (0, next_info.set_zwj_bit());
1392-
} else if is_ligature_transparent(c) {
1393-
return (0, next_info);
1394-
}
1395-
}
1396-
1397-
match (next_info, c) {"""
1398-
if is_cjk:
1399-
s += """
1400-
(WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, _) if is_solidus_transparent(c) => {
1401-
return (
1402-
lookup_width_cjk(c).0 as i8,
1403-
WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY,
1404-
);
1405-
}
1406-
(WidthInfo::JOINING_GROUP_ALEF, '\\u{0338}') => {
1407-
return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF);
1408-
}
1409-
// Arabic Lam-Alef ligature
1410-
(WidthInfo::JOINING_GROUP_ALEF | WidthInfo::SOLIDUS_OVERLAY_ALEF, _)
1411-
if is_joining_group_lam(c) =>
1412-
{
1413-
return (0, WidthInfo::DEFAULT)
1414-
}
1415-
(WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => {
1416-
return (0, WidthInfo::JOINING_GROUP_ALEF);
1417-
}
1418-
"""
1419-
else:
1420-
s += """
1421-
// Arabic Lam-Alef ligature
1422-
(WidthInfo::JOINING_GROUP_ALEF, _) if is_joining_group_lam(c) => {
1423-
return (0, WidthInfo::DEFAULT)
1424-
}
1425-
(WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => {
1426-
return (0, WidthInfo::JOINING_GROUP_ALEF);
1427-
}
1428-
"""
1429-
1430-
s += """
1431-
// Hebrew Alef-ZWJ-Lamed ligature
1432-
(WidthInfo::ZWJ_HEBREW_LETTER_LAMED, '\\u{05D0}') => {
1433-
return (0, WidthInfo::DEFAULT);
1434-
}
1435-
1436-
// Khmer coeng signs
1437-
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
1438-
return (-1, WidthInfo::DEFAULT);
1439-
}
1440-
1441-
// Buginese <a, -i> ZWJ ya ligature
1442-
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
1443-
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
1444-
}
1445-
(WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA, '\\u{1A15}') => {
1446-
return (0, WidthInfo::DEFAULT)
1447-
}
1448-
1449-
// Tifinagh bi-consonants
1450-
(WidthInfo::TIFINAGH_CONSONANT | WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D7F}') => {
1451-
return (1, WidthInfo::TIFINAGH_JOINER_CONSONANT);
1452-
}
1453-
(WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
1454-
return (0, WidthInfo::DEFAULT);
1455-
}
1456-
(WidthInfo::TIFINAGH_JOINER_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
1457-
return (-1, WidthInfo::DEFAULT);
1458-
}
1459-
1460-
// Lisu tone letter combinations
1461-
(WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, '\\u{A4F8}'..='\\u{A4FB}') => {
1462-
return (0, WidthInfo::DEFAULT);
1463-
}
1464-
1465-
// Old Turkic ligature
1466-
(WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\\u{10C32}') => {
1467-
return (0, WidthInfo::DEFAULT);
1468-
}"""
1469-
1470-
s += f"""
1471-
// Emoji modifier
1472-
(WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{
1473-
return (0, WidthInfo::EMOJI_PRESENTATION);
1474-
}}
1475-
1476-
// Regional indicator
1477-
(
1478-
WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR,
1479-
'\\u{{1F1E6}}'..='\\u{{1F1FF}}',
1480-
) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR),
1481-
1482-
// ZWJ emoji
1483-
(
1484-
WidthInfo::EMOJI_PRESENTATION
1485-
| WidthInfo::SEVERAL_REGIONAL_INDICATOR
1486-
| WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION
1487-
| WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION
1488-
| WidthInfo::EMOJI_MODIFIER,
1489-
'\\u{{200D}}',
1490-
) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION),
1491-
(WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{
1492-
return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION);
1493-
}}
1494-
(WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{
1495-
return (0, WidthInfo::EMOJI_PRESENTATION)
1496-
}}
1497-
(WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{
1498-
return (0, WidthInfo::EMOJI_PRESENTATION)
1499-
}}
1500-
(WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{
1501-
return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION);
1502-
}}
1503-
(
1504-
WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION
1505-
| WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
1506-
'\\u{{1F1E6}}'..='\\u{{1F1FF}}',
1507-
) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
1508-
(
1509-
WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
1510-
'\\u{{1F1E6}}'..='\\u{{1F1FF}}',
1511-
) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
1512-
(WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{
1513-
return (0, WidthInfo::EMOJI_MODIFIER);
1514-
}}
1515-
(WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{
1516-
return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION);
1517-
}}
1518-
(WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1519-
return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION);
1520-
}}
1521-
(WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1522-
return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION)
1523-
}}
1524-
(WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1525-
return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION)
1526-
}}
1527-
(WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1528-
return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION)
1529-
}}
1530-
(WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1531-
return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION)
1532-
}}
1533-
(WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
1534-
return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION)
1535-
}}
1536-
(
1537-
WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION
1538-
| WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION
1539-
| WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION
1540-
| WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
1541-
| WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION,
1542-
'\\u{{E0030}}'..='\\u{{E0039}}',
1543-
) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION),
1544-
(WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
1545-
return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION);
1546-
}}
1547-
(WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
1548-
return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION);
1549-
}}
1550-
(
1551-
WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
1552-
| WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION
1553-
| WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION
1554-
| WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION
1555-
| WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION,
1556-
'\\u{{1F3F4}}',
1557-
) => return (0, WidthInfo::EMOJI_PRESENTATION),
1558-
(WidthInfo::ZWJ_EMOJI_PRESENTATION, _)
1559-
if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION =>
1560-
{{
1561-
return (0, WidthInfo::EMOJI_PRESENTATION)
1562-
}}
1563-
1564-
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D63}}') => {{
1565-
return (0, WidthInfo::DEFAULT);
1566-
}}
1567-
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D67}}') => {{
1568-
return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
1569-
}}
1570-
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D68}}') => {{
1571-
return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
1572-
}}
1573-
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D69}}') => {{
1574-
return (0, WidthInfo::DEFAULT);
1575-
}}
1576-
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\u{{16D63}}') => {{
1577-
return (0, WidthInfo::DEFAULT);
1578-
}}
1579-
1580-
// Fallback
1581-
_ => {{}}
1582-
}}
1583-
}}
1584-
1585-
let ret = lookup_width{cjk_lo}(c);
1586-
(ret.0 as i8, ret.1)
1587-
}}
1588-
}}
15891315
"""
15901316

15911317
return s
@@ -1627,7 +1353,6 @@ def emit_props(
16271353

16281354
module.write(
16291355
"""/// Whether this character has Joining_Group=Lam.
1630-
#[rustfmt::skip]
16311356
pub fn is_joining_group_lam(c: char) -> bool {
16321357
matches!(
16331358
c,
@@ -1650,7 +1375,6 @@ def emit_props(
16501375
16511376
/// Whether this character is a default-ignorable combining mark
16521377
/// or ZWJ. These characters won't interrupt non-Arabic ligatures.
1653-
#[rustfmt::skip]
16541378
pub fn is_ligature_transparent(c: char) -> bool {
16551379
matches!(
16561380
c,
@@ -1774,7 +1498,6 @@ def emit_lookup(
17741498
joining_group_lam: list[tuple[Codepoint, Codepoint]],
17751499
):
17761500
"""Outputs a Rust module to `module` containing generated lookup functions."""
1777-
module.write("use crate::props::*;\n")
17781501
module.write("use crate::tables::*;\n")
17791502
module.write("use crate::width_info::WidthInfo;\n\n")
17801503

0 commit comments

Comments
 (0)