Handle UTF in long filenames (#20087)
Co-authored-by: Scott Lahteine <thinkyhead@users.noreply.github.com>
This commit is contained in:
		| @@ -1256,6 +1256,10 @@ | ||||
|                                       // Note: Only affects SCROLL_LONG_FILENAMES with SDSORT_CACHE_NAMES but not SDSORT_DYNAMIC_RAM. | ||||
|   #endif | ||||
|  | ||||
|   // Allow international symbols in long filenames. To display correctly, the | ||||
|   // LCD's font must contain the characters. Check your selected LCD language. | ||||
|   #define UTF_FILENAME_SUPPORT | ||||
|  | ||||
|   // This allows hosts to request long names for files and folders with M33 | ||||
|   //#define LONG_FILENAME_HOST_SUPPORT | ||||
|  | ||||
|   | ||||
| @@ -9,6 +9,8 @@ | ||||
|  | ||||
| #include "../inc/MarlinConfig.h" | ||||
|  | ||||
| #define MAX_UTF8_CHAR_SIZE 4 | ||||
|  | ||||
| #if HAS_WIRED_LCD | ||||
|   #include "marlinui.h" | ||||
|   #include "../MarlinCore.h" | ||||
| @@ -79,6 +81,8 @@ uint8_t* get_utf8_value_cb(uint8_t *pstart, read_byte_cb_t cb_read_byte, wchar_t | ||||
|   uint32_t val = 0; | ||||
|   uint8_t *p = pstart; | ||||
|  | ||||
|   #define NEXT_6_BITS() do{ val <<= 6; p++; valcur = cb_read_byte(p); val |= (valcur & 0x3F); }while(0) | ||||
|  | ||||
|   uint8_t valcur = cb_read_byte(p); | ||||
|   if (0 == (0x80 & valcur)) { | ||||
|     val = valcur; | ||||
| @@ -86,74 +90,51 @@ uint8_t* get_utf8_value_cb(uint8_t *pstart, read_byte_cb_t cb_read_byte, wchar_t | ||||
|   } | ||||
|   else if (0xC0 == (0xE0 & valcur)) { | ||||
|     val = valcur & 0x1F; | ||||
|     val <<= 6; | ||||
|     p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     p++; | ||||
|   } | ||||
|   else if (0xE0 == (0xF0 & valcur)) { | ||||
|     val = valcur & 0x0F; | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     p++; | ||||
|   } | ||||
|   else if (0xF0 == (0xF8 & valcur)) { | ||||
|     val = valcur & 0x07; | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     p++; | ||||
|   } | ||||
|   else if (0xF8 == (0xFC & valcur)) { | ||||
|     val = valcur & 0x03; | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     p++; | ||||
|   } | ||||
|   else if (0xFC == (0xFE & valcur)) { | ||||
|     val = valcur & 0x01; | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     valcur = cb_read_byte(p); | ||||
|     val |= (valcur & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|   } | ||||
|   #if MAX_UTF8_CHAR_SIZE >= 3 | ||||
|     else if (0xE0 == (0xF0 & valcur)) { | ||||
|       val = valcur & 0x0F; | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       p++; | ||||
|     } | ||||
|   #endif | ||||
|   #if MAX_UTF8_CHAR_SIZE >= 4 | ||||
|     else if (0xF0 == (0xF8 & valcur)) { | ||||
|       val = valcur & 0x07; | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       p++; | ||||
|     } | ||||
|   #endif | ||||
|   #if MAX_UTF8_CHAR_SIZE >= 5 | ||||
|     else if (0xF8 == (0xFC & valcur)) { | ||||
|       val = valcur & 0x03; | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       p++; | ||||
|     } | ||||
|   #endif | ||||
|   #if MAX_UTF8_CHAR_SIZE >= 6 | ||||
|     else if (0xFC == (0xFE & valcur)) { | ||||
|       val = valcur & 0x01; | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       NEXT_6_BITS(); | ||||
|       p++; | ||||
|     } | ||||
|   #endif | ||||
|   else if (0x80 == (0xC0 & valcur)) | ||||
|     for (; 0x80 == (0xC0 & valcur); ) { p++; valcur = cb_read_byte(p); } | ||||
|   else | ||||
|     for (; ((0xFE & valcur) > 0xFC); ) { p++; valcur = cb_read_byte(p); } | ||||
|     for (; 0xFC < (0xFE & valcur); ) { p++; valcur = cb_read_byte(p); } | ||||
|  | ||||
|   if (pval) *pval = val; | ||||
|  | ||||
|   | ||||
| @@ -1103,19 +1103,67 @@ int8_t SdBaseFile::readDir(dir_t* dir, char* longFilename) { | ||||
|         if (WITHIN(seq, 1, MAX_VFAT_ENTRIES)) { | ||||
|           // TODO: Store the filename checksum to verify if a long-filename-unaware system modified the file table. | ||||
|           n = (seq - 1) * (FILENAME_LENGTH); | ||||
|           LOOP_L_N(i, FILENAME_LENGTH) | ||||
|             longFilename[n + i] = (i < 5) ? VFAT->name1[i] : (i < 11) ? VFAT->name2[i - 5] : VFAT->name3[i - 11]; | ||||
|           LOOP_L_N(i, FILENAME_LENGTH) { | ||||
|             uint16_t utf16_ch = (i < 5) ? VFAT->name1[i] : (i < 11) ? VFAT->name2[i - 5] : VFAT->name3[i - 11]; | ||||
|             #if ENABLED(UTF_FILENAME_SUPPORT) | ||||
|               // We can't convert to UTF-8 here because UTF-8 is a variable-size encoding, while joining LFN | ||||
|               // blocks requires fixed byte offsets. So just store the full UTF-16LE words here and convert later. | ||||
|               uint16_t idx = (n + i) * 2; // This is fixed as FAT LFN always contain UTF-16LE encoding | ||||
|               longFilename[idx] = utf16_ch & 0xFF; | ||||
|               longFilename[idx+1] = (utf16_ch >> 8) & 0xFF; | ||||
|             #else | ||||
|               // Replace all multibyte characters with '_' | ||||
|               longFilename[n + i] = (utf16_ch > 0xFF) ? '_' : (utf16_ch & 0xFF); | ||||
|             #endif | ||||
|           } | ||||
|           // If this VFAT entry is the last one, add a NUL terminator at the end of the string | ||||
|           if (VFAT->sequenceNumber & 0x40) longFilename[n + FILENAME_LENGTH] = '\0'; | ||||
|           if (VFAT->sequenceNumber & 0x40) longFilename[(n + FILENAME_LENGTH) * LONG_FILENAME_CHARSIZE] = '\0'; | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     // Return if normal file or subdirectory | ||||
|     if (DIR_IS_FILE_OR_SUBDIR(dir)) return n; | ||||
|     if (DIR_IS_FILE_OR_SUBDIR(dir)) { | ||||
|       #if ENABLED(UTF_FILENAME_SUPPORT) | ||||
|         // Convert the filename from UTF-16 to UTF-8, as Marlin expects | ||||
|         #if LONG_FILENAME_CHARSIZE > 2 | ||||
|           // Warn developers that 3-byte cases are not currently supported: converting a series of 2-byte | ||||
|           // UTF-16 code units to 3-byte UTF-8 sequences in place would overwrite the rest of the filename. | ||||
|           #error "Currently filename re-encoding is done in-place. It may break the remaining chars to use 3-byte codepoints." | ||||
|         #endif | ||||
|         uint16_t currentPos = 0; | ||||
|         LOOP_L_N(i, (LONG_FILENAME_LENGTH / 2)) { | ||||
|           uint16_t idx = i * 2; // This is fixed as FAT LFN always contain UTF-16LE encoding | ||||
|  | ||||
|           uint16_t utf16_ch = longFilename[idx] | (longFilename[idx + 1] << 8); | ||||
|           if (0xD800 == (utf16_ch & 0xF800))                                    // Surrogate pair - encode as '_' | ||||
|             longFilename[currentPos++] = '_'; | ||||
|           else if (0 == (utf16_ch & 0xFF80))                                    // Encode as 1-byte utf-8 char | ||||
|             longFilename[currentPos++] = utf16_ch & 0x007F; | ||||
|           else if (0 == (utf16_ch & 0xF800)) {                                  // Encode as 2-byte utf-8 char | ||||
|             longFilename[currentPos++] = 0xC0 | ((utf16_ch >> 6) & 0x1F); | ||||
|             longFilename[currentPos++] = 0x80 | (utf16_ch & 0x3F); | ||||
|           } | ||||
|           else { | ||||
|             #if LONG_FILENAME_CHARSIZE > 2                                      // Encode as 3-byte utf-8 char | ||||
|               longFilename[currentPos++] = 0xE0 | ((utf16_ch >> 12) & 0x0F); | ||||
|               longFilename[currentPos++] = 0xC0 | ((utf16_ch >> 6) & 0x3F); | ||||
|               longFilename[currentPos++] = 0xC0 | (utf16_ch & 0x3F); | ||||
|             #else                                                               // Encode as '_' | ||||
|               longFilename[currentPos++] = '_'; | ||||
|             #endif | ||||
|           } | ||||
|  | ||||
|           if (0 == utf16_ch) break; // End of filename | ||||
|         } | ||||
|         return currentPos; | ||||
|       #else | ||||
|         return n; | ||||
|       #endif | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| // Read next directory entry into the cache | ||||
| // Assumes file is correctly positioned | ||||
| dir_t* SdBaseFile::readDirCache() { | ||||
|   | ||||
| @@ -103,5 +103,10 @@ | ||||
|  | ||||
| #define FILENAME_LENGTH 13 // Number of UTF-16 characters per entry | ||||
|  | ||||
| // UTF-8 may use up to 3 bytes to represent a single UTF-16 code unit. | ||||
| // We discard 3-byte characters, allowing only 2-byte characters, | ||||
| // or only 1-byte characters if UTF_FILENAME_SUPPORT is disabled. | ||||
| #define LONG_FILENAME_CHARSIZE TERN(UTF_FILENAME_SUPPORT, 2, 1) | ||||
|  | ||||
| // Total bytes needed to store a single long filename | ||||
| #define LONG_FILENAME_LENGTH (FILENAME_LENGTH * MAX_VFAT_ENTRIES + 1) | ||||
| #define LONG_FILENAME_LENGTH (FILENAME_LENGTH * LONG_FILENAME_CHARSIZE * MAX_VFAT_ENTRIES + 1) | ||||
|   | ||||
| @@ -71,63 +71,49 @@ uint8_t* get_utf8_value(uint8_t *pstart, wchar_t *pval) { | ||||
|  | ||||
|   assert(NULL != pstart); | ||||
|  | ||||
|   #define NEXT_6_BITS() do{ val <<= 6; p++; val |= (*p & 0x3F); }while(0) | ||||
|  | ||||
|   if (0 == (0x80 & *p)) { | ||||
|     val = (size_t)*p; | ||||
|     p++; | ||||
|   } | ||||
|   else if (0xC0 == (0xE0 & *p)) { | ||||
|     val = *p & 0x1F; | ||||
|     val <<= 6; | ||||
|     p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|     assert((wchar_t)val == get_val_utf82uni(pstart)); | ||||
|   } | ||||
|   else if (0xE0 == (0xF0 & *p)) { | ||||
|     val = *p & 0x0F; | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|     assert((wchar_t)val == get_val_utf82uni(pstart)); | ||||
|   } | ||||
|   else if (0xF0 == (0xF8 & *p)) { | ||||
|     val = *p & 0x07; | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|     assert((wchar_t)val == get_val_utf82uni(pstart)); | ||||
|   } | ||||
|   else if (0xF8 == (0xFC & *p)) { | ||||
|     val = *p & 0x03; | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|     assert((wchar_t)val == get_val_utf82uni(pstart)); | ||||
|   } | ||||
|   else if (0xFC == (0xFE & *p)) { | ||||
|     val = *p & 0x01; | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     val <<= 6; p++; | ||||
|     val |= (*p & 0x3F); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     NEXT_6_BITS(); | ||||
|     p++; | ||||
|     assert((wchar_t)val == get_val_utf82uni(pstart)); | ||||
|   } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user