ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / fs / udf / unicode.c
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *      Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *      OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *              http://www.osta.org/
11  *      UTF-8 is explained in IETF RFC 3629.
12  *              https://www.rfc-editor.org/rfc/rfc3629.txt
13  *
14  * CONTACTS
15  *      E-mail regarding any portion of the Linux UDF file system should be
16  *      directed to the development team's mailing list (run by majordomo):
17  *              linux_udf@hpesjro.fc.hp.com
18  *
19  * COPYRIGHT
20  *      This file is distributed under the terms of the GNU General Public
21  *      License (GPL). Copies of the GPL can be obtained from:
22  *              ftp://prep.ai.mit.edu/pub/gnu/GPL
23  *      Each contributing author retains all rights to their own work.
24  */
25
26 #include "udfdecl.h"
27
28 #include <linux/kernel.h>
29 #include <linux/string.h>       /* for memset */
30 #include <linux/nls.h>
31 #include <linux/udf_fs.h>
32
33 #include "udf_sb.h"
34
35 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
36
37 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
38 {
39         if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) )
40                 return 0;
41         memset(dest, 0, sizeof(struct ustr));
42         memcpy(dest->u_name, src, strlen);
43         dest->u_cmpID = 0x08;
44         dest->u_len = strlen;
45         return strlen;
46 }
47
48 /*
49  * udf_build_ustr
50  */
51 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
52 {
53         int usesize;
54
55         if ( (!dest) || (!ptr) || (!size) )
56                 return -1;
57
58         memset(dest, 0, sizeof(struct ustr));
59         usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
60         dest->u_cmpID=ptr[0];
61         dest->u_len=ptr[size-1];
62         memcpy(dest->u_name, ptr+1, usesize-1);
63         return 0;
64 }
65
66 /*
67  * udf_build_ustr_exact
68  */
69 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
70 {
71         if ( (!dest) || (!ptr) || (!exactsize) )
72                 return -1;
73
74         memset(dest, 0, sizeof(struct ustr));
75         dest->u_cmpID=ptr[0];
76         dest->u_len=exactsize-1;
77         memcpy(dest->u_name, ptr+1, exactsize-1);
78         return 0;
79 }
80
81 /*
82  * udf_ocu_to_utf8
83  *
84  * PURPOSE
85  *      Convert OSTA Compressed Unicode to the UTF-8 equivalent.
86  *
87  * DESCRIPTION
88  *      This routine is only called by udf_filldir().
89  *
90  * PRE-CONDITIONS
91  *      utf                     Pointer to UTF-8 output buffer.
92  *      ocu                     Pointer to OSTA Compressed Unicode input buffer
93  *                              of size UDF_NAME_LEN bytes.
94  *                              both of type "struct ustr *"
95  *
96  * POST-CONDITIONS
97  *      <return>                Zero on success.
98  *
99  * HISTORY
100  *      November 12, 1997 - Andrew E. Mileski
101  *      Written, tested, and released.
102  */
103 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
104 {
105         uint8_t *ocu;
106         uint32_t c;
107         uint8_t cmp_id, ocu_len;
108         int i;
109
110         ocu = ocu_i->u_name;
111
112         ocu_len = ocu_i->u_len;
113         cmp_id = ocu_i->u_cmpID;
114         utf_o->u_len = 0;
115
116         if (ocu_len == 0)
117         {
118                 memset(utf_o, 0, sizeof(struct ustr));
119                 utf_o->u_cmpID = 0;
120                 utf_o->u_len = 0;
121                 return 0;
122         }
123
124         if ((cmp_id != 8) && (cmp_id != 16))
125         {
126                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
127                 return 0;
128         }
129
130         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
131         {
132
133                 /* Expand OSTA compressed Unicode to Unicode */
134                 c = ocu[i++];
135                 if (cmp_id == 16)
136                         c = (c << 8) | ocu[i++];
137
138                 /* Compress Unicode to UTF-8 */
139                 if (c < 0x80U)
140                         utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
141                 else if (c < 0x800U)
142                 {
143                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6));
144                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
145                 }
146                 else
147                 {
148                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12));
149                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
150                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
151                 }
152         }
153         utf_o->u_cmpID=8;
154
155         return utf_o->u_len;
156 }
157
158 /*
159  *
160  * udf_utf8_to_ocu
161  *
162  * PURPOSE
163  *      Convert UTF-8 to the OSTA Compressed Unicode equivalent.
164  *
165  * DESCRIPTION
166  *      This routine is only called by udf_lookup().
167  *
168  * PRE-CONDITIONS
169  *      ocu                     Pointer to OSTA Compressed Unicode output
170  *                              buffer of size UDF_NAME_LEN bytes.
171  *      utf                     Pointer to UTF-8 input buffer.
172  *      utf_len                 Length of UTF-8 input buffer in bytes.
173  *
174  * POST-CONDITIONS
175  *      <return>                Zero on success.
176  *
177  * HISTORY
178  *      November 12, 1997 - Andrew E. Mileski
179  *      Written, tested, and released.
180  */
181 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
182 {
183         unsigned c, i, max_val, utf_char;
184         int utf_cnt, u_len;
185
186         memset(ocu, 0, sizeof(dstring) * length);
187         ocu[0] = 8;
188         max_val = 0xffU;
189
190 try_again:
191         u_len = 0U;
192         utf_char = 0U;
193         utf_cnt = 0U;
194         for (i = 0U; i < utf->u_len; i++)
195         {
196                 c = (uint8_t)utf->u_name[i];
197
198                 /* Complete a multi-byte UTF-8 character */
199                 if (utf_cnt)
200                 {
201                         utf_char = (utf_char << 6) | (c & 0x3fU);
202                         if (--utf_cnt)
203                                 continue;
204                 }
205                 else
206                 {
207                         /* Check for a multi-byte UTF-8 character */
208                         if (c & 0x80U)
209                         {
210                                 /* Start a multi-byte UTF-8 character */
211                                 if ((c & 0xe0U) == 0xc0U)
212                                 {
213                                         utf_char = c & 0x1fU;
214                                         utf_cnt = 1;
215                                 }
216                                 else if ((c & 0xf0U) == 0xe0U)
217                                 {
218                                         utf_char = c & 0x0fU;
219                                         utf_cnt = 2;
220                                 }
221                                 else if ((c & 0xf8U) == 0xf0U)
222                                 {
223                                         utf_char = c & 0x07U;
224                                         utf_cnt = 3;
225                                 }
226                                 else if ((c & 0xfcU) == 0xf8U)
227                                 {
228                                         utf_char = c & 0x03U;
229                                         utf_cnt = 4;
230                                 }
231                                 else if ((c & 0xfeU) == 0xfcU)
232                                 {
233                                         utf_char = c & 0x01U;
234                                         utf_cnt = 5;
235                                 }
236                                 else
237                                         goto error_out;
238                                 continue;
239                         } else
240                                 /* Single byte UTF-8 character (most common) */
241                                 utf_char = c;
242                 }
243
244                 /* Choose no compression if necessary */
245                 if (utf_char > max_val)
246                 {
247                         if ( 0xffU == max_val )
248                         {
249                                 max_val = 0xffffU;
250                                 ocu[0] = (uint8_t)0x10U;
251                                 goto try_again;
252                         }
253                         goto error_out;
254                 }
255
256                 if (max_val == 0xffffU)
257                 {
258                         ocu[++u_len] = (uint8_t)(utf_char >> 8);
259                 }
260                 ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
261         }
262
263
264         if (utf_cnt)
265         {
266 error_out:
267                 ocu[++u_len] = '?';
268                 printk(KERN_DEBUG "udf: bad UTF-8 character\n");
269         }
270
271         ocu[length - 1] = (uint8_t)u_len + 1;
272         return u_len + 1;
273 }
274
275 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i)
276 {
277         uint8_t *ocu;
278         uint32_t c;
279         uint8_t cmp_id, ocu_len;
280         int i;
281
282         ocu = ocu_i->u_name;
283
284         ocu_len = ocu_i->u_len;
285         cmp_id = ocu_i->u_cmpID;
286         utf_o->u_len = 0;
287
288         if (ocu_len == 0)
289         {
290                 memset(utf_o, 0, sizeof(struct ustr));
291                 utf_o->u_cmpID = 0;
292                 utf_o->u_len = 0;
293                 return 0;
294         }
295
296         if ((cmp_id != 8) && (cmp_id != 16))
297         {
298                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
299                 return 0;
300         }
301
302         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
303         {
304                 /* Expand OSTA compressed Unicode to Unicode */
305                 c = ocu[i++];
306                 if (cmp_id == 16)
307                         c = (c << 8) | ocu[i++];
308
309                 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 
310                         UDF_NAME_LEN - utf_o->u_len);
311         }
312         utf_o->u_cmpID=8;
313
314         return utf_o->u_len;
315 }
316
317 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length)
318 {
319         unsigned len, i, max_val;
320         uint16_t uni_char;
321         int u_len;
322
323         memset(ocu, 0, sizeof(dstring) * length);
324         ocu[0] = 8;
325         max_val = 0xffU;
326
327 try_again:
328         u_len = 0U;
329         for (i = 0U; i < uni->u_len; i++)
330         {
331                 len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char);
332                 if (len <= 0)
333                         continue;
334
335                 if (uni_char > max_val)
336                 {
337                         max_val = 0xffffU;
338                         ocu[0] = (uint8_t)0x10U;
339                         goto try_again;
340                 }
341                 
342                 if (max_val == 0xffffU)
343                         ocu[++u_len] = (uint8_t)(uni_char >> 8);
344                 ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
345                 i += len - 1;
346         }
347
348         ocu[length - 1] = (uint8_t)u_len + 1;
349         return u_len + 1;
350 }
351
352 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen)
353 {
354         struct ustr filename, unifilename;
355         int len;
356
357         if (udf_build_ustr_exact(&unifilename, sname, flen))
358         {
359                 return 0;
360         }
361
362         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
363         {
364                 if (!udf_CS0toUTF8(&filename, &unifilename) )
365                 {
366                         udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
367                         return 0;
368                 }
369         }
370         else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
371         {
372                 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) )
373                 {
374                         udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
375                         return 0;
376                 }
377         }
378         else
379                 return 0;
380
381         if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
382                 unifilename.u_name, unifilename.u_len)))
383         {
384                 return len;
385         }
386         return 0;
387 }
388
389 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen)
390 {
391         struct ustr unifilename;
392         int namelen;
393
394         if ( !(udf_char_to_ustr(&unifilename, sname, flen)) )
395         {
396                 return 0;
397         }
398
399         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
400         {
401                 if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) )
402                 {
403                         return 0;
404                 }
405         }
406         else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
407         {
408                 if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) )
409                 {
410                         return 0;
411                 }
412         }
413         else
414                 return 0;
415
416         return namelen;
417 }
418
#define ILLEGAL_CHAR_MARK       '_'
#define EXT_MARK                        '.'
#define CRC_MARK                        '#'
#define EXT_SIZE                        5

/* True for the bytes that get replaced in a translated name: '/' and NUL. */
static int udf_is_illegal_name_char(uint8_t ch)
{
        return ch == '/' || ch == 0;
}

/*
 * udf_translate_to_linux
 *
 * PURPOSE
 *      Copy a converted name into newName, mangling it where needed.
 *
 * DESCRIPTION
 *      Runs of '/' and NUL bytes collapse into a single ILLEGAL_CHAR_MARK.
 *      A name is mangled when it is "." or "..", when it contained such
 *      bytes, or when it exceeded 256 output bytes. A mangled name becomes
 *      <base> CRC_MARK <4 hex digits of the CRC of fidName> followed, when
 *      the name had an extension of at most EXT_SIZE bytes, by EXT_MARK
 *      and that extension.
 *
 * PRE-CONDITIONS
 *      newName                 Output buffer.
 *      udfName, udfLen         Converted name and its length.
 *      fidName, fidNameLen     Bytes the CRC is computed over.
 *
 * POST-CONDITIONS
 *      <return>                Number of bytes written to newName.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
{
        static const uint8_t hex_digits[] = "0123456789ABCDEF";
        uint8_t ext[EXT_SIZE];
        int ext_len = 0;
        int src, k, out = 0;
        int mangle = 0;
        int have_ext = 0, ext_src = 0, ext_out = 0;
        unsigned short crc;
        uint8_t ch;

        if (udfName[0] == '.' && (udfLen == 1 ||
                (udfLen == 2 && udfName[1] == '.')))
        {
                /* "." and ".." are copied verbatim and always get a CRC. */
                memcpy(newName, udfName, udfLen);
                out = udfLen;
                mangle = 1;
        }
        else
        {
                src = 0;
                while (src < udfLen)
                {
                        ch = udfName[src];
                        if (udf_is_illegal_name_char(ch))
                        {
                                mangle = 1;
                                ch = ILLEGAL_CHAR_MARK;
                                /* Swallow the rest of the illegal run. */
                                while (src + 1 < udfLen &&
                                        udf_is_illegal_name_char(udfName[src + 1]))
                                        src++;
                        }

                        /* A dot close enough to the end starts the extension. */
                        if (ch == EXT_MARK && (udfLen - src - 1) <= EXT_SIZE)
                        {
                                have_ext = (udfLen != src + 1);
                                if (have_ext)
                                {
                                        ext_src = src;
                                        ext_out = out;
                                }
                        }

                        if (out < 256)
                                newName[out++] = ch;
                        else
                                mangle = 1;
                        src++;
                }
        }

        if (!mangle)
                return out;

        if (have_ext)
        {
                const uint8_t *tail = udfName + ext_src + 1;
                int tail_len = udfLen - ext_src - 1;

                /* Re-read the extension from the source, sanitising it. */
                for (k = 0; k < EXT_SIZE && k < tail_len; k++)
                {
                        ch = tail[k];
                        if (udf_is_illegal_name_char(ch))
                        {
                                ch = ILLEGAL_CHAR_MARK;
                                while (k + 1 < tail_len && k + 1 < EXT_SIZE &&
                                        udf_is_illegal_name_char(tail[k + 1]))
                                        k++;
                        }
                        ext[ext_len++] = ch;
                }

                /* Cut the base so CRC and extension still fit after it. */
                if (out > 250 - ext_len)
                        out = 250 - ext_len;
                else
                        out = ext_out;
        }
        else if (out > 250)
        {
                out = 250;
        }

        crc = udf_crc(fidName, fidNameLen, 0);
        newName[out++] = CRC_MARK;
        newName[out++] = hex_digits[(crc & 0xf000) >> 12];
        newName[out++] = hex_digits[(crc & 0x0f00) >> 8];
        newName[out++] = hex_digits[(crc & 0x00f0) >> 4];
        newName[out++] = hex_digits[(crc & 0x000f)];

        if (have_ext)
        {
                newName[out++] = EXT_MARK;
                for (k = 0; k < ext_len; k++)
                        newName[out++] = ext[k];
        }

        return out;
}