This URL has Read-Only access.

Statistics
| Branch: | Tag: | Revision:

root / src / tinyxml / tinyxmlparser.cpp @ f4cf6bf7

History | View | Annotate | Download (36.5 kB)

1
/*
2
www.sourceforge.net/projects/tinyxml
3
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4

    
5
This software is provided 'as-is', without any express or implied 
6
warranty. In no event will the authors be held liable for any 
7
damages arising from the use of this software.
8

    
9
Permission is granted to anyone to use this software for any 
10
purpose, including commercial applications, and to alter it and 
11
redistribute it freely, subject to the following restrictions:
12

    
13
1. The origin of this software must not be misrepresented; you must 
14
not claim that you wrote the original software. If you use this
15
software in a product, an acknowledgment in the product documentation
16
would be appreciated but is not required.
17

    
18
2. Altered source versions must be plainly marked as such, and 
19
must not be misrepresented as being the original software.
20

    
21
3. This notice may not be removed or altered from any source 
22
distribution.
23
*/
24

    
25
#include <ctype.h>
26
#include <stddef.h>
27

    
28
#include "tinyxml.h"
29

    
30
// Uncomment the next line to turn on parser debug logging through TIXML_LOG.
//#define DEBUG_PARSER
#ifdef DEBUG_PARSER
    // Debug MSVC builds log through OutputDebugString; everything else uses printf.
#   if defined( DEBUG ) && defined( _MSC_VER )
#       include <windows.h>
#       define TIXML_LOG OutputDebugString
#   else
#       define TIXML_LOG printf
#   endif
#endif
39

    
40
// Note tha "PutString" hardcodes the same list. This
41
// is less flexible than it appears. Changing the entries
42
// or order will break putstring.        
43
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 
44
{
45
        { "&amp;",  5, '&' },
46
        { "&lt;",   4, '<' },
47
        { "&gt;",   4, '>' },
48
        { "&quot;", 6, '\"' },
49
        { "&apos;", 6, '\'' }
50
};
51

    
52
// A good collection of Unicode information lives at:
//      http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the lead-byte table used by this parser,
// which gives the number of bytes in a sequence from its lead byte. Invalid
// lead bytes are given a length of 1: the result will be junk, but it is
// passed through as far as possible.
//
// Beware of the 3 byte sequences that are treated specially in UTF-8 input:
//      ef bb bf    (Microsoft "lead bytes")
//      ef bf be
//      ef bf bf

// The three bytes of the Microsoft UTF-8 lead sequence, in order.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFU;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBU;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFU;
65

    
66
// Length in bytes of a UTF-8 sequence, indexed by its lead byte.
// Bytes that cannot start a sequence map to 1 so that bad input is still
// consumed one byte at a time.
const int TiXmlBase::utf8ByteTable[256] =
{
//  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x70  end of ASCII range
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x80  0x80 to 0xc1 invalid
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x90
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0xa0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0xb0
    1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    // 0xc0  0xc2 to 0xdf: 2 bytes
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    // 0xd0
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    // 0xe0  0xe0 to 0xef: 3 bytes
    4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1     // 0xf0  0xf0 to 0xf4: 4 bytes, 0xf5 and higher invalid
};
86

    
87

    
88
// Encode one UTF-32 code point as a UTF-8 byte sequence.
//
//  input   The code point to encode.
//  output  Receives the encoded bytes. Up to 4 bytes are written and no
//          terminating null is added.
//  length  Receives the number of bytes written (1 to 4), or 0 when the
//          code point cannot be encoded; 'output' is untouched in that case.
void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
{
    const unsigned long BYTE_MASK = 0xBF;
    const unsigned long BYTE_MARK = 0x80;
    const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

    if ( input < 0x80 )
        *length = 1;
    else if ( input < 0x800 )
        *length = 2;
    else if ( input < 0x10000 )
        *length = 3;
    else if ( input <= 0x10FFFF )
        *length = 4;
    else
    {
        // Past the last Unicode code point (U+10FFFF). Encoding it would
        // produce a sequence that is not legal UTF-8, and whose lead byte
        // (0xf5 and higher for most of the range) utf8ByteTable treats as
        // invalid, so report "nothing written" instead.
        *length = 0;
        return;
    }

    // The sequence is filled in back to front: each continuation byte takes
    // the low 6 bits of what is left of the code point.
    output += *length;
    for ( int continuation = *length - 1; continuation > 0; --continuation )
    {
        --output;
        *output = (char)((input | BYTE_MARK) & BYTE_MASK);
        input >>= 6;
    }

    // The lead byte carries the remaining high bits plus the length marker.
    --output;
    *output = (char)(input | FIRST_BYTE_MARK[*length]);
}
127

    
128

    
129
// Returns non-zero if 'anyByte' should be treated as a letter.
//
// Only the ASCII range (0 to 127) is actually classified, using isalpha().
// Every byte above that is assumed to be a valid letter: working out what is
// alphabetical across encodings is tricky, so a very conservative approach
// is taken. The encoding argument is currently unused.
/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
    // 128, not 127: DEL (0x7f) is still ASCII and must go through isalpha()
    // rather than being accepted as a letter.
    if ( anyByte < 128 )
        return isalpha( anyByte );

    return 1;    // What else to do? The unicode set is huge...get the english ones right.
}
148

    
149

    
150
// Returns non-zero if 'anyByte' should be treated as a letter or a digit.
//
// Only the ASCII range (0 to 127) is actually classified, using isalnum().
// Every byte above that is assumed to be valid: working out what is
// alphabetical across encodings is tricky, so a very conservative approach
// is taken. The encoding argument is currently unused.
/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
    // 128, not 127: DEL (0x7f) is still ASCII and must go through isalnum()
    // rather than being accepted as a name character.
    if ( anyByte < 128 )
        return isalnum( anyByte );

    return 1;    // What else to do? The unicode set is huge...get the english ones right.
}
169

    
170

    
171
class TiXmlParsingData
172
{
173
        friend class TiXmlDocument;
174
  public:
175
        void Stamp( const char* now, TiXmlEncoding encoding );
176

    
177
        const TiXmlCursor& Cursor()        { return cursor; }
178

    
179
  private:
180
        // Only used by the document!
181
        TiXmlParsingData( const char* start, int _tabsize, int row, int col )
182
        {
183
                assert( start );
184
                stamp = start;
185
                tabsize = _tabsize;
186
                cursor.row = row;
187
                cursor.col = col;
188
        }
189

    
190
        TiXmlCursor                cursor;
191
        const char*                stamp;
192
        int                                tabsize;
193
};
194

    
195

    
196
void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
197
{
198
        assert( now );
199

    
200
        // Do nothing if the tabsize is 0.
201
        if ( tabsize < 1 )
202
        {
203
                return;
204
        }
205

    
206
        // Get the current row, column.
207
        int row = cursor.row;
208
        int col = cursor.col;
209
        const char* p = stamp;
210
        assert( p );
211

    
212
        while ( p < now )
213
        {
214
                // Treat p as unsigned, so we have a happy compiler.
215
                const unsigned char* pU = (const unsigned char*)p;
216

    
217
                // Code contributed by Fletcher Dunn: (modified by lee)
218
                switch (*pU) {
219
                        case 0:
220
                                // We *should* never get here, but in case we do, don't
221
                                // advance past the terminating null character, ever
222
                                return;
223

    
224
                        case '\r':
225
                                // bump down to the next line
226
                                ++row;
227
                                col = 0;                                
228
                                // Eat the character
229
                                ++p;
230

    
231
                                // Check for \r\n sequence, and treat this as a single character
232
                                if (*p == '\n') {
233
                                        ++p;
234
                                }
235
                                break;
236

    
237
                        case '\n':
238
                                // bump down to the next line
239
                                ++row;
240
                                col = 0;
241

    
242
                                // Eat the character
243
                                ++p;
244

    
245
                                // Check for \n\r sequence, and treat this as a single
246
                                // character.  (Yes, this bizarre thing does occur still
247
                                // on some arcane platforms...)
248
                                if (*p == '\r') {
249
                                        ++p;
250
                                }
251
                                break;
252

    
253
                        case '\t':
254
                                // Eat the character
255
                                ++p;
256

    
257
                                // Skip to next tab stop
258
                                col = (col / tabsize + 1) * tabsize;
259
                                break;
260

    
261
                        case TIXML_UTF_LEAD_0:
262
                                if ( encoding == TIXML_ENCODING_UTF8 )
263
                                {
264
                                        if ( *(p+1) && *(p+2) )
265
                                        {
266
                                                // In these cases, don't advance the column. These are
267
                                                // 0-width spaces.
268
                                                if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
269
                                                        p += 3;        
270
                                                else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
271
                                                        p += 3;        
272
                                                else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
273
                                                        p += 3;        
274
                                                else
275
                                                        { p +=3; ++col; }        // A normal character.
276
                                        }
277
                                }
278
                                else
279
                                {
280
                                        ++p;
281
                                        ++col;
282
                                }
283
                                break;
284

    
285
                        default:
286
                                if ( encoding == TIXML_ENCODING_UTF8 )
287
                                {
288
                                        // Eat the 1 to 4 byte utf8 character.
289
                                        int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
290
                                        if ( step == 0 )
291
                                                step = 1;                // Error case from bad encoding, but handle gracefully.
292
                                        p += step;
293

    
294
                                        // Just advance one column, of course.
295
                                        ++col;
296
                                }
297
                                else
298
                                {
299
                                        ++p;
300
                                        ++col;
301
                                }
302
                                break;
303
                }
304
        }
305
        cursor.row = row;
306
        cursor.col = col;
307
        assert( cursor.row >= -1 );
308
        assert( cursor.col >= -1 );
309
        stamp = p;
310
        assert( stamp );
311
}
312

    
313

    
314
const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
315
{
316
        if ( !p || !*p )
317
        {
318
                return 0;
319
        }
320
        if ( encoding == TIXML_ENCODING_UTF8 )
321
        {
322
                while ( *p )
323
                {
324
                        const unsigned char* pU = (const unsigned char*)p;
325
                        
326
                        // Skip the stupid Microsoft UTF-8 Byte order marks
327
                        if (        *(pU+0)==TIXML_UTF_LEAD_0
328
                                 && *(pU+1)==TIXML_UTF_LEAD_1 
329
                                 && *(pU+2)==TIXML_UTF_LEAD_2 )
330
                        {
331
                                p += 3;
332
                                continue;
333
                        }
334
                        else if(*(pU+0)==TIXML_UTF_LEAD_0
335
                                 && *(pU+1)==0xbfU
336
                                 && *(pU+2)==0xbeU )
337
                        {
338
                                p += 3;
339
                                continue;
340
                        }
341
                        else if(*(pU+0)==TIXML_UTF_LEAD_0
342
                                 && *(pU+1)==0xbfU
343
                                 && *(pU+2)==0xbfU )
344
                        {
345
                                p += 3;
346
                                continue;
347
                        }
348

    
349
                        if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )                // Still using old rules for white space.
350
                                ++p;
351
                        else
352
                                break;
353
                }
354
        }
355
        else
356
        {
357
                while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
358
                        ++p;
359
        }
360

    
361
        return p;
362
}
363

    
364
#ifdef TIXML_USE_STL
365
/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
366
{
367
        for( ;; )
368
        {
369
                if ( !in->good() ) return false;
370

    
371
                int c = in->peek();
372
                // At this scope, we can't get to a document. So fail silently.
373
                if ( !IsWhiteSpace( c ) || c <= 0 )
374
                        return true;
375

    
376
                *tag += (char) in->get();
377
        }
378
}
379

    
380
// Move characters from 'in' onto the end of 'tag' until 'character' is the
// next character in the stream; that character is not consumed.
//
// Returns true if 'character' was reached. Returns false if peek() gives a
// value <= 0 or the stream stops being good first.
/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
{
    for ( ;; )
    {
        if ( !in->good() )
            return false;

        const int next = in->peek();
        if ( next == character )
            return true;
        if ( next <= 0 )        // Silent failure: can't get document at this scope
            return false;

        in->get();
        *tag += (char) next;
    }
}
396
#endif
397

    
398
// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
399
// "assign" optimization removes over 10% of the execution time.
400
//
401
const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
402
{
403
        // Oddly, not supported on some comilers,
404
        //name->clear();
405
        // So use this:
406
        *name = "";
407
        assert( p );
408

    
409
        // Names start with letters or underscores.
410
        // Of course, in unicode, tinyxml has no idea what a letter *is*. The
411
        // algorithm is generous.
412
        //
413
        // After that, they can be letters, underscores, numbers,
414
        // hyphens, or colons. (Colons are valid ony for namespaces,
415
        // but tinyxml can't tell namespaces from names.)
416
        if (    p && *p 
417
                 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
418
        {
419
                const char* start = p;
420
                while(                p && *p
421
                                &&        (                IsAlphaNum( (unsigned char ) *p, encoding ) 
422
                                                 || *p == '_'
423
                                                 || *p == '-'
424
                                                 || *p == '.'
425
                                                 || *p == ':' ) )
426
                {
427
                        //(*name) += *p; // expensive
428
                        ++p;
429
                }
430
                if ( p-start > 0 ) {
431
                        name->assign( start, p-start );
432
                }
433
                return p;
434
        }
435
        return 0;
436
}
437

    
438
const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
439
{
440
        // Presume an entity, and pull it out.
441
    TIXML_STRING ent;
442
        int i;
443
        *length = 0;
444

    
445
        if ( *(p+1) && *(p+1) == '#' && *(p+2) )
446
        {
447
                unsigned long ucs = 0;
448
                ptrdiff_t delta = 0;
449
                unsigned mult = 1;
450

    
451
                if ( *(p+2) == 'x' )
452
                {
453
                        // Hexadecimal.
454
                        if ( !*(p+3) ) return 0;
455

    
456
                        const char* q = p+3;
457
                        q = strchr( q, ';' );
458

    
459
                        if ( !q || !*q ) return 0;
460

    
461
                        delta = q-p;
462
                        --q;
463

    
464
                        while ( *q != 'x' )
465
                        {
466
                                if ( *q >= '0' && *q <= '9' )
467
                                        ucs += mult * (*q - '0');
468
                                else if ( *q >= 'a' && *q <= 'f' )
469
                                        ucs += mult * (*q - 'a' + 10);
470
                                else if ( *q >= 'A' && *q <= 'F' )
471
                                        ucs += mult * (*q - 'A' + 10 );
472
                                else 
473
                                        return 0;
474
                                mult *= 16;
475
                                --q;
476
                        }
477
                }
478
                else
479
                {
480
                        // Decimal.
481
                        if ( !*(p+2) ) return 0;
482

    
483
                        const char* q = p+2;
484
                        q = strchr( q, ';' );
485

    
486
                        if ( !q || !*q ) return 0;
487

    
488
                        delta = q-p;
489
                        --q;
490

    
491
                        while ( *q != '#' )
492
                        {
493
                                if ( *q >= '0' && *q <= '9' )
494
                                        ucs += mult * (*q - '0');
495
                                else 
496
                                        return 0;
497
                                mult *= 10;
498
                                --q;
499
                        }
500
                }
501
                if ( encoding == TIXML_ENCODING_UTF8 )
502
                {
503
                        // convert the UCS to UTF-8
504
                        ConvertUTF32ToUTF8( ucs, value, length );
505
                }
506
                else
507
                {
508
                        *value = (char)ucs;
509
                        *length = 1;
510
                }
511
                return p + delta + 1;
512
        }
513

    
514
        // Now try to match it.
515
        for( i=0; i<NUM_ENTITY; ++i )
516
        {
517
                if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
518
                {
519
                        assert( strlen( entity[i].str ) == entity[i].strLength );
520
                        *value = entity[i].chr;
521
                        *length = 1;
522
                        return ( p + entity[i].strLength );
523
                }
524
        }
525

    
526
        // So it wasn't an entity, its unrecognized, or something like that.
527
        *value = *p;        // Don't put back the last one, since we return it!
528
        //*length = 1;        // Leave unrecognized entities - this doesn't really work.
529
                                        // Just writes strange XML.
530
        return p+1;
531
}
532

    
533

    
534
bool TiXmlBase::StringEqual( const char* p,
535
                                                         const char* tag,
536
                                                         bool ignoreCase,
537
                                                         TiXmlEncoding encoding )
538
{
539
        assert( p );
540
        assert( tag );
541
        if ( !p || !*p )
542
        {
543
                assert( 0 );
544
                return false;
545
        }
546

    
547
        const char* q = p;
548

    
549
        if ( ignoreCase )
550
        {
551
                while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
552
                {
553
                        ++q;
554
                        ++tag;
555
                }
556

    
557
                if ( *tag == 0 )
558
                        return true;
559
        }
560
        else
561
        {
562
                while ( *q && *tag && *q == *tag )
563
                {
564
                        ++q;
565
                        ++tag;
566
                }
567

    
568
                if ( *tag == 0 )                // Have we found the end of the tag, and everything equal?
569
                        return true;
570
        }
571
        return false;
572
}
573

    
574
const char* TiXmlBase::ReadText(        const char* p, 
575
                                                                        TIXML_STRING * text, 
576
                                                                        bool trimWhiteSpace, 
577
                                                                        const char* endTag, 
578
                                                                        bool caseInsensitive,
579
                                                                        TiXmlEncoding encoding )
580
{
581
    *text = "";
582
        if (    !trimWhiteSpace                        // certain tags always keep whitespace
583
                 || !condenseWhiteSpace )        // if true, whitespace is always kept
584
        {
585
                // Keep all the white space.
586
                while (           p && *p
587
                                && !StringEqual( p, endTag, caseInsensitive, encoding )
588
                          )
589
                {
590
                        int len;
591
                        char cArr[4] = { 0, 0, 0, 0 };
592
                        p = GetChar( p, cArr, &len, encoding );
593
                        text->append( cArr, len );
594
                }
595
        }
596
        else
597
        {
598
                bool whitespace = false;
599

    
600
                // Remove leading white space:
601
                p = SkipWhiteSpace( p, encoding );
602
                while (           p && *p
603
                                && !StringEqual( p, endTag, caseInsensitive, encoding ) )
604
                {
605
                        if ( *p == '\r' || *p == '\n' )
606
                        {
607
                                whitespace = true;
608
                                ++p;
609
                        }
610
                        else if ( IsWhiteSpace( *p ) )
611
                        {
612
                                whitespace = true;
613
                                ++p;
614
                        }
615
                        else
616
                        {
617
                                // If we've found whitespace, add it before the
618
                                // new character. Any whitespace just becomes a space.
619
                                if ( whitespace )
620
                                {
621
                                        (*text) += ' ';
622
                                        whitespace = false;
623
                                }
624
                                int len;
625
                                char cArr[4] = { 0, 0, 0, 0 };
626
                                p = GetChar( p, cArr, &len, encoding );
627
                                if ( len == 1 )
628
                                        (*text) += cArr[0];        // more efficient
629
                                else
630
                                        text->append( cArr, len );
631
                        }
632
                }
633
        }
634
        if ( p ) 
635
                p += strlen( endTag );
636
        return p;
637
}
638

    
639
#ifdef TIXML_USE_STL
640

    
641
void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
642
{
643
        // The basic issue with a document is that we don't know what we're
644
        // streaming. Read something presumed to be a tag (and hope), then
645
        // identify it, and call the appropriate stream method on the tag.
646
        //
647
        // This "pre-streaming" will never read the closing ">" so the
648
        // sub-tag can orient itself.
649

    
650
        if ( !StreamTo( in, '<', tag ) ) 
651
        {
652
                SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
653
                return;
654
        }
655

    
656
        while ( in->good() )
657
        {
658
                int tagIndex = (int) tag->length();
659
                while ( in->good() && in->peek() != '>' )
660
                {
661
                        int c = in->get();
662
                        if ( c <= 0 )
663
                        {
664
                                SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
665
                                break;
666
                        }
667
                        (*tag) += (char) c;
668
                }
669

    
670
                if ( in->good() )
671
                {
672
                        // We now have something we presume to be a node of 
673
                        // some sort. Identify it, and call the node to
674
                        // continue streaming.
675
                        TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
676

    
677
                        if ( node )
678
                        {
679
                                node->StreamIn( in, tag );
680
                                bool isElement = node->ToElement() != 0;
681
                                delete node;
682
                                node = 0;
683

    
684
                                // If this is the root element, we're done. Parsing will be
685
                                // done by the >> operator.
686
                                if ( isElement )
687
                                {
688
                                        return;
689
                                }
690
                        }
691
                        else
692
                        {
693
                                SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
694
                                return;
695
                        }
696
                }
697
        }
698
        // We should have returned sooner.
699
        SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
700
}
701

    
702
#endif
703

    
704
const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
705
{
706
        ClearError();
707

    
708
        // Parse away, at the document level. Since a document
709
        // contains nothing but other tags, most of what happens
710
        // here is skipping white space.
711
        if ( !p || !*p )
712
        {
713
                SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
714
                return 0;
715
        }
716

    
717
        // Note that, for a document, this needs to come
718
        // before the while space skip, so that parsing
719
        // starts from the pointer we are given.
720
        location.Clear();
721
        if ( prevData )
722
        {
723
                location.row = prevData->cursor.row;
724
                location.col = prevData->cursor.col;
725
        }
726
        else
727
        {
728
                location.row = 0;
729
                location.col = 0;
730
        }
731
        TiXmlParsingData data( p, TabSize(), location.row, location.col );
732
        location = data.Cursor();
733

    
734
        if ( encoding == TIXML_ENCODING_UNKNOWN )
735
        {
736
                // Check for the Microsoft UTF-8 lead bytes.
737
                const unsigned char* pU = (const unsigned char*)p;
738
                if (        *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
739
                         && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
740
                         && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
741
                {
742
                        encoding = TIXML_ENCODING_UTF8;
743
                        useMicrosoftBOM = true;
744
                }
745
        }
746

    
747
    p = SkipWhiteSpace( p, encoding );
748
        if ( !p )
749
        {
750
                SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
751
                return 0;
752
        }
753

    
754
        while ( p && *p )
755
        {
756
                TiXmlNode* node = Identify( p, encoding );
757
                if ( node )
758
                {
759
                        p = node->Parse( p, &data, encoding );
760
                        LinkEndChild( node );
761
                }
762
                else
763
                {
764
                        break;
765
                }
766

    
767
                // Did we get encoding info?
768
                if (    encoding == TIXML_ENCODING_UNKNOWN
769
                         && node->ToDeclaration() )
770
                {
771
                        TiXmlDeclaration* dec = node->ToDeclaration();
772
                        const char* enc = dec->Encoding();
773
                        assert( enc );
774

    
775
                        if ( *enc == 0 )
776
                                encoding = TIXML_ENCODING_UTF8;
777
                        else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
778
                                encoding = TIXML_ENCODING_UTF8;
779
                        else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
780
                                encoding = TIXML_ENCODING_UTF8;        // incorrect, but be nice
781
                        else 
782
                                encoding = TIXML_ENCODING_LEGACY;
783
                }
784

    
785
                p = SkipWhiteSpace( p, encoding );
786
        }
787

    
788
        // Was this empty?
789
        if ( !firstChild ) {
790
                SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
791
                return 0;
792
        }
793

    
794
        // All is well.
795
        return p;
796
}
797

    
798
void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
799
{        
800
        // The first error in a chain is more accurate - don't set again!
801
        if ( error )
802
                return;
803

    
804
        assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
805
        error   = true;
806
        errorId = err;
807
        errorDesc = errorString[ errorId ];
808

    
809
        errorLocation.Clear();
810
        if ( pError && data )
811
        {
812
                data->Stamp( pError, encoding );
813
                errorLocation = data->Cursor();
814
        }
815
}
816

    
817

    
818
// Look at the markup starting at 'p' and create an empty node of the
// matching kind. The new node's parent is set to this node so that it can
// report errors. Returns 0 when 'p' (after leading white space) does not
// start with '<'.
//
// Classification, in the order the tests are made:
//   "<?xml"      (case-insensitive)  -> TiXmlDeclaration
//   "<!--"                           -> TiXmlComment
//   "<![CDATA["                      -> TiXmlText flagged as CDATA
//   "<!"                             -> TiXmlUnknown (DTD-style markup)
//   '<' + letter or underscore       -> TiXmlElement
//   anything else                    -> TiXmlUnknown
// The comment and CDATA tests must come before the generic "<!" test,
// because both of their headers also begin with "<!".
TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
{
	p = SkipWhiteSpace( p, encoding );
	if ( !p || *p != '<' )
		return 0;

	TiXmlDocument* doc = GetDocument();

	// NOTE(review): p already points at '<' here, so this second skip looks
	// redundant - confirm before removing it.
	p = SkipWhiteSpace( p, encoding );
	if ( !p || !*p )
		return 0;

	TiXmlNode* created = 0;

	if ( StringEqual( p, "<?xml", true, encoding ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Declaration\n" );
		#endif
		created = new TiXmlDeclaration();
	}
	else if ( StringEqual( p, "<!--", false, encoding ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Comment\n" );
		#endif
		created = new TiXmlComment();
	}
	else if ( StringEqual( p, "<![CDATA[", false, encoding ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing CDATA\n" );
		#endif
		TiXmlText* cdataText = new TiXmlText( "" );
		cdataText->SetCDATA( true );
		created = cdataText;
	}
	else if ( StringEqual( p, "<!", false, encoding ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Unknown(1)\n" );
		#endif
		created = new TiXmlUnknown();
	}
	else if ( IsAlpha( p[1], encoding ) || p[1] == '_' )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Element\n" );
		#endif
		created = new TiXmlElement( "" );
	}
	else
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Unknown(2)\n" );
		#endif
		created = new TiXmlUnknown();
	}

	if ( created )
	{
		// Give the node a parent so it can reach the document for errors.
		created->parent = this;
	}
	else if ( doc )
	{
		doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
	}
	return created;
}
906

    
907
#ifdef TIXML_USE_STL
908

    
909
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
910
{
911
        // We're called with some amount of pre-parsing. That is, some of "this"
912
        // element is in "tag". Go ahead and stream to the closing ">"
913
        while( in->good() )
914
        {
915
                int c = in->get();
916
                if ( c <= 0 )
917
                {
918
                        TiXmlDocument* document = GetDocument();
919
                        if ( document )
920
                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
921
                        return;
922
                }
923
                (*tag) += (char) c ;
924
                
925
                if ( c == '>' )
926
                        break;
927
        }
928

    
929
        if ( tag->length() < 3 ) return;
930

    
931
        // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
932
        // If not, identify and stream.
933

    
934
        if (    tag->at( tag->length() - 1 ) == '>' 
935
                 && tag->at( tag->length() - 2 ) == '/' )
936
        {
937
                // All good!
938
                return;
939
        }
940
        else if ( tag->at( tag->length() - 1 ) == '>' )
941
        {
942
                // There is more. Could be:
943
                //                text
944
                //                cdata text (which looks like another node)
945
                //                closing tag
946
                //                another node.
947
                for ( ;; )
948
                {
949
                        StreamWhiteSpace( in, tag );
950

    
951
                        // Do we have text?
952
                        if ( in->good() && in->peek() != '<' ) 
953
                        {
954
                                // Yep, text.
955
                                TiXmlText text( "" );
956
                                text.StreamIn( in, tag );
957

    
958
                                // What follows text is a closing tag or another node.
959
                                // Go around again and figure it out.
960
                                continue;
961
                        }
962

    
963
                        // We now have either a closing tag...or another node.
964
                        // We should be at a "<", regardless.
965
                        if ( !in->good() ) return;
966
                        assert( in->peek() == '<' );
967
                        int tagIndex = (int) tag->length();
968

    
969
                        bool closingTag = false;
970
                        bool firstCharFound = false;
971

    
972
                        for( ;; )
973
                        {
974
                                if ( !in->good() )
975
                                        return;
976

    
977
                                int c = in->peek();
978
                                if ( c <= 0 )
979
                                {
980
                                        TiXmlDocument* document = GetDocument();
981
                                        if ( document )
982
                                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
983
                                        return;
984
                                }
985
                                
986
                                if ( c == '>' )
987
                                        break;
988

    
989
                                *tag += (char) c;
990
                                in->get();
991

    
992
                                // Early out if we find the CDATA id.
993
                                if ( c == '[' && tag->size() >= 9 )
994
                                {
995
                                        size_t len = tag->size();
996
                                        const char* start = tag->c_str() + len - 9;
997
                                        if ( strcmp( start, "<![CDATA[" ) == 0 ) {
998
                                                assert( !closingTag );
999
                                                break;
1000
                                        }
1001
                                }
1002

    
1003
                                if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
1004
                                {
1005
                                        firstCharFound = true;
1006
                                        if ( c == '/' )
1007
                                                closingTag = true;
1008
                                }
1009
                        }
1010
                        // If it was a closing tag, then read in the closing '>' to clean up the input stream.
1011
                        // If it was not, the streaming will be done by the tag.
1012
                        if ( closingTag )
1013
                        {
1014
                                if ( !in->good() )
1015
                                        return;
1016

    
1017
                                int c = in->get();
1018
                                if ( c <= 0 )
1019
                                {
1020
                                        TiXmlDocument* document = GetDocument();
1021
                                        if ( document )
1022
                                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1023
                                        return;
1024
                                }
1025
                                assert( c == '>' );
1026
                                *tag += (char) c;
1027

    
1028
                                // We are done, once we've found our closing tag.
1029
                                return;
1030
                        }
1031
                        else
1032
                        {
1033
                                // If not a closing tag, id it, and stream.
1034
                                const char* tagloc = tag->c_str() + tagIndex;
1035
                                TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1036
                                if ( !node )
1037
                                        return;
1038
                                node->StreamIn( in, tag );
1039
                                delete node;
1040
                                node = 0;
1041

    
1042
                                // No return: go around from the beginning: text, closing tag, or node.
1043
                        }
1044
                }
1045
        }
1046
}
1047
#endif
1048

    
1049
const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1050
{
1051
        p = SkipWhiteSpace( p, encoding );
1052
        TiXmlDocument* document = GetDocument();
1053

    
1054
        if ( !p || !*p )
1055
        {
1056
                if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1057
                return 0;
1058
        }
1059

    
1060
        if ( data )
1061
        {
1062
                data->Stamp( p, encoding );
1063
                location = data->Cursor();
1064
        }
1065

    
1066
        if ( *p != '<' )
1067
        {
1068
                if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1069
                return 0;
1070
        }
1071

    
1072
        p = SkipWhiteSpace( p+1, encoding );
1073

    
1074
        // Read the name.
1075
        const char* pErr = p;
1076

    
1077
    p = ReadName( p, &value, encoding );
1078
        if ( !p || !*p )
1079
        {
1080
                if ( document )        document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1081
                return 0;
1082
        }
1083

    
1084
    TIXML_STRING endTag ("</");
1085
        endTag += value;
1086
        endTag += ">";
1087

    
1088
        // Check for and read attributes. Also look for an empty
1089
        // tag or an end tag.
1090
        while ( p && *p )
1091
        {
1092
                pErr = p;
1093
                p = SkipWhiteSpace( p, encoding );
1094
                if ( !p || !*p )
1095
                {
1096
                        if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1097
                        return 0;
1098
                }
1099
                if ( *p == '/' )
1100
                {
1101
                        ++p;
1102
                        // Empty tag.
1103
                        if ( *p  != '>' )
1104
                        {
1105
                                if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );                
1106
                                return 0;
1107
                        }
1108
                        return (p+1);
1109
                }
1110
                else if ( *p == '>' )
1111
                {
1112
                        // Done with attributes (if there were any.)
1113
                        // Read the value -- which can include other
1114
                        // elements -- read the end tag, and return.
1115
                        ++p;
1116
                        p = ReadValue( p, data, encoding );                // Note this is an Element method, and will set the error if one happens.
1117
                        if ( !p || !*p ) {
1118
                                // We were looking for the end tag, but found nothing.
1119
                                // Fix for [ 1663758 ] Failure to report error on bad XML
1120
                                if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1121
                                return 0;
1122
                        }
1123

    
1124
                        // We should find the end tag now
1125
                        if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1126
                        {
1127
                                p += endTag.length();
1128
                                return p;
1129
                        }
1130
                        else
1131
                        {
1132
                                if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1133
                                return 0;
1134
                        }
1135
                }
1136
                else
1137
                {
1138
                        // Try to read an attribute:
1139
                        TiXmlAttribute* attrib = new TiXmlAttribute();
1140
                        if ( !attrib )
1141
                        {
1142
                                if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1143
                                return 0;
1144
                        }
1145

    
1146
                        attrib->SetDocument( document );
1147
                        pErr = p;
1148
                        p = attrib->Parse( p, data, encoding );
1149

    
1150
                        if ( !p || !*p )
1151
                        {
1152
                                if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1153
                                delete attrib;
1154
                                return 0;
1155
                        }
1156

    
1157
                        // Handle the strange case of double attributes:
1158
                        #ifdef TIXML_USE_STL
1159
                        TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1160
                        #else
1161
                        TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1162
                        #endif
1163
                        if ( node )
1164
                        {
1165
                                node->SetValue( attrib->Value() );
1166
                                delete attrib;
1167
                                return 0;
1168
                        }
1169

    
1170
                        attributeSet.Add( attrib );
1171
                }
1172
        }
1173
        return p;
1174
}
1175

    
1176

    
1177
const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1178
{
1179
        TiXmlDocument* document = GetDocument();
1180

    
1181
        // Read in text and elements in any order.
1182
        const char* pWithWhiteSpace = p;
1183
        p = SkipWhiteSpace( p, encoding );
1184

    
1185
        while ( p && *p )
1186
        {
1187
                if ( *p != '<' )
1188
                {
1189
                        // Take what we have, make a text element.
1190
                        TiXmlText* textNode = new TiXmlText( "" );
1191

    
1192
                        if ( !textNode )
1193
                        {
1194
                                if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1195
                                    return 0;
1196
                        }
1197

    
1198
                        if ( TiXmlBase::IsWhiteSpaceCondensed() )
1199
                        {
1200
                                p = textNode->Parse( p, data, encoding );
1201
                        }
1202
                        else
1203
                        {
1204
                                // Special case: we want to keep the white space
1205
                                // so that leading spaces aren't removed.
1206
                                p = textNode->Parse( pWithWhiteSpace, data, encoding );
1207
                        }
1208

    
1209
                        if ( !textNode->Blank() )
1210
                                LinkEndChild( textNode );
1211
                        else
1212
                                delete textNode;
1213
                } 
1214
                else 
1215
                {
1216
                        // We hit a '<'
1217
                        // Have we hit a new element or an end tag? This could also be
1218
                        // a TiXmlText in the "CDATA" style.
1219
                        if ( StringEqual( p, "</", false, encoding ) )
1220
                        {
1221
                                return p;
1222
                        }
1223
                        else
1224
                        {
1225
                                TiXmlNode* node = Identify( p, encoding );
1226
                                if ( node )
1227
                                {
1228
                                        p = node->Parse( p, data, encoding );
1229
                                        LinkEndChild( node );
1230
                                }                                
1231
                                else
1232
                                {
1233
                                        return 0;
1234
                                }
1235
                        }
1236
                }
1237
                pWithWhiteSpace = p;
1238
                p = SkipWhiteSpace( p, encoding );
1239
        }
1240

    
1241
        if ( !p )
1242
        {
1243
                if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1244
        }        
1245
        return p;
1246
}
1247

    
1248

    
1249
#ifdef TIXML_USE_STL
1250
void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
1251
{
1252
        while ( in->good() )
1253
        {
1254
                int c = in->get();        
1255
                if ( c <= 0 )
1256
                {
1257
                        TiXmlDocument* document = GetDocument();
1258
                        if ( document )
1259
                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1260
                        return;
1261
                }
1262
                (*tag) += (char) c;
1263

    
1264
                if ( c == '>' )
1265
                {
1266
                        // All is well.
1267
                        return;                
1268
                }
1269
        }
1270
}
1271
#endif
1272

    
1273

    
1274
const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1275
{
1276
        TiXmlDocument* document = GetDocument();
1277
        p = SkipWhiteSpace( p, encoding );
1278

    
1279
        if ( data )
1280
        {
1281
                data->Stamp( p, encoding );
1282
                location = data->Cursor();
1283
        }
1284
        if ( !p || !*p || *p != '<' )
1285
        {
1286
                if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1287
                return 0;
1288
        }
1289
        ++p;
1290
    value = "";
1291

    
1292
        while ( p && *p && *p != '>' )
1293
        {
1294
                value += *p;
1295
                ++p;
1296
        }
1297

    
1298
        if ( !p )
1299
        {
1300
                if ( document )        document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1301
        }
1302
        if ( *p == '>' )
1303
                return p+1;
1304
        return p;
1305
}
1306

    
1307
#ifdef TIXML_USE_STL
1308
void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
1309
{
1310
        while ( in->good() )
1311
        {
1312
                int c = in->get();        
1313
                if ( c <= 0 )
1314
                {
1315
                        TiXmlDocument* document = GetDocument();
1316
                        if ( document )
1317
                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1318
                        return;
1319
                }
1320

    
1321
                (*tag) += (char) c;
1322

    
1323
                if ( c == '>' 
1324
                         && tag->at( tag->length() - 2 ) == '-'
1325
                         && tag->at( tag->length() - 3 ) == '-' )
1326
                {
1327
                        // All is well.
1328
                        return;                
1329
                }
1330
        }
1331
}
1332
#endif
1333

    
1334

    
1335
const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1336
{
1337
        TiXmlDocument* document = GetDocument();
1338
        value = "";
1339

    
1340
        p = SkipWhiteSpace( p, encoding );
1341

    
1342
        if ( data )
1343
        {
1344
                data->Stamp( p, encoding );
1345
                location = data->Cursor();
1346
        }
1347
        const char* startTag = "<!--";
1348
        const char* endTag   = "-->";
1349

    
1350
        if ( !StringEqual( p, startTag, false, encoding ) )
1351
        {
1352
                document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1353
                return 0;
1354
        }
1355
        p += strlen( startTag );
1356

    
1357
        // [ 1475201 ] TinyXML parses entities in comments
1358
        // Oops - ReadText doesn't work, because we don't want to parse the entities.
1359
        // p = ReadText( p, &value, false, endTag, false, encoding );
1360
        //
1361
        // from the XML spec:
1362
        /*
1363
         [Definition: Comments may appear anywhere in a document outside other markup; in addition, 
1364
                      they may appear within the document type declaration at places allowed by the grammar. 
1365
                                  They are not part of the document's character data; an XML processor MAY, but need not, 
1366
                                  make it possible for an application to retrieve the text of comments. For compatibility, 
1367
                                  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 
1368
                                  references MUST NOT be recognized within comments.
1369

    
1370
                                  An example of a comment:
1371

    
1372
                                  <!-- declarations for <head> & <body> -->
1373
        */
1374

    
1375
    value = "";
1376
        // Keep all the white space.
1377
        while (        p && *p && !StringEqual( p, endTag, false, encoding ) )
1378
        {
1379
                value.append( p, 1 );
1380
                ++p;
1381
        }
1382
        if ( p ) 
1383
                p += strlen( endTag );
1384

    
1385
        return p;
1386
}
1387

    
1388

    
1389
const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1390
{
1391
        p = SkipWhiteSpace( p, encoding );
1392
        if ( !p || !*p ) return 0;
1393

    
1394
//        int tabsize = 4;
1395
//        if ( document )
1396
//                tabsize = document->TabSize();
1397

    
1398
        if ( data )
1399
        {
1400
                data->Stamp( p, encoding );
1401
                location = data->Cursor();
1402
        }
1403
        // Read the name, the '=' and the value.
1404
        const char* pErr = p;
1405
        p = ReadName( p, &name, encoding );
1406
        if ( !p || !*p )
1407
        {
1408
                if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1409
                return 0;
1410
        }
1411
        p = SkipWhiteSpace( p, encoding );
1412
        if ( !p || !*p || *p != '=' )
1413
        {
1414
                if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1415
                return 0;
1416
        }
1417

    
1418
        ++p;        // skip '='
1419
        p = SkipWhiteSpace( p, encoding );
1420
        if ( !p || !*p )
1421
        {
1422
                if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1423
                return 0;
1424
        }
1425
        
1426
        const char* end;
1427
        const char SINGLE_QUOTE = '\'';
1428
        const char DOUBLE_QUOTE = '\"';
1429

    
1430
        if ( *p == SINGLE_QUOTE )
1431
        {
1432
                ++p;
1433
                end = "\'";                // single quote in string
1434
                p = ReadText( p, &value, false, end, false, encoding );
1435
        }
1436
        else if ( *p == DOUBLE_QUOTE )
1437
        {
1438
                ++p;
1439
                end = "\"";                // double quote in string
1440
                p = ReadText( p, &value, false, end, false, encoding );
1441
        }
1442
        else
1443
        {
1444
                // All attribute values should be in single or double quotes.
1445
                // But this is such a common error that the parser will try
1446
                // its best, even without them.
1447
                value = "";
1448
                while (    p && *p                                                                                        // existence
1449
                                && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'        // whitespace
1450
                                && *p != '/' && *p != '>' )                                                        // tag end
1451
                {
1452
                        if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1453
                                // [ 1451649 ] Attribute values with trailing quotes not handled correctly
1454
                                // We did not have an opening quote but seem to have a 
1455
                                // closing one. Give up and throw an error.
1456
                                if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1457
                                return 0;
1458
                        }
1459
                        value += *p;
1460
                        ++p;
1461
                }
1462
        }
1463
        return p;
1464
}
1465

    
1466
#ifdef TIXML_USE_STL
1467
void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1468
{
1469
        while ( in->good() )
1470
        {
1471
                int c = in->peek();        
1472
                if ( !cdata && (c == '<' ) ) 
1473
                {
1474
                        return;
1475
                }
1476
                if ( c <= 0 )
1477
                {
1478
                        TiXmlDocument* document = GetDocument();
1479
                        if ( document )
1480
                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1481
                        return;
1482
                }
1483

    
1484
                (*tag) += (char) c;
1485
                in->get();        // "commits" the peek made above
1486

    
1487
                if ( cdata && c == '>' && tag->size() >= 3 ) {
1488
                        size_t len = tag->size();
1489
                        if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1490
                                // terminator of cdata.
1491
                                return;
1492
                        }
1493
                }    
1494
        }
1495
}
1496
#endif
1497

    
1498
const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1499
{
1500
        value = "";
1501
        TiXmlDocument* document = GetDocument();
1502

    
1503
        if ( data )
1504
        {
1505
                data->Stamp( p, encoding );
1506
                location = data->Cursor();
1507
        }
1508

    
1509
        const char* const startTag = "<![CDATA[";
1510
        const char* const endTag   = "]]>";
1511

    
1512
        if ( cdata || StringEqual( p, startTag, false, encoding ) )
1513
        {
1514
                cdata = true;
1515

    
1516
                if ( !StringEqual( p, startTag, false, encoding ) )
1517
                {
1518
                        document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1519
                        return 0;
1520
                }
1521
                p += strlen( startTag );
1522

    
1523
                // Keep all the white space, ignore the encoding, etc.
1524
                while (           p && *p
1525
                                && !StringEqual( p, endTag, false, encoding )
1526
                          )
1527
                {
1528
                        value += *p;
1529
                        ++p;
1530
                }
1531

    
1532
                TIXML_STRING dummy; 
1533
                p = ReadText( p, &dummy, false, endTag, false, encoding );
1534
                return p;
1535
        }
1536
        else
1537
        {
1538
                bool ignoreWhite = true;
1539

    
1540
                const char* end = "<";
1541
                p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1542
                if ( p )
1543
                        return p-1;        // don't truncate the '<'
1544
                return 0;
1545
        }
1546
}
1547

    
1548
#ifdef TIXML_USE_STL
1549
void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1550
{
1551
        while ( in->good() )
1552
        {
1553
                int c = in->get();
1554
                if ( c <= 0 )
1555
                {
1556
                        TiXmlDocument* document = GetDocument();
1557
                        if ( document )
1558
                                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1559
                        return;
1560
                }
1561
                (*tag) += (char) c;
1562

    
1563
                if ( c == '>' )
1564
                {
1565
                        // All is well.
1566
                        return;
1567
                }
1568
        }
1569
}
1570
#endif
1571

    
1572
const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1573
{
1574
        p = SkipWhiteSpace( p, _encoding );
1575
        // Find the beginning, find the end, and look for
1576
        // the stuff in-between.
1577
        TiXmlDocument* document = GetDocument();
1578
        if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1579
        {
1580
                if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1581
                return 0;
1582
        }
1583
        if ( data )
1584
        {
1585
                data->Stamp( p, _encoding );
1586
                location = data->Cursor();
1587
        }
1588
        p += 5;
1589

    
1590
        version = "";
1591
        encoding = "";
1592
        standalone = "";
1593

    
1594
        while ( p && *p )
1595
        {
1596
                if ( *p == '>' )
1597
                {
1598
                        ++p;
1599
                        return p;
1600
                }
1601

    
1602
                p = SkipWhiteSpace( p, _encoding );
1603
                if ( StringEqual( p, "version", true, _encoding ) )
1604
                {
1605
                        TiXmlAttribute attrib;
1606
                        p = attrib.Parse( p, data, _encoding );                
1607
                        version = attrib.Value();
1608
                }
1609
                else if ( StringEqual( p, "encoding", true, _encoding ) )
1610
                {
1611
                        TiXmlAttribute attrib;
1612
                        p = attrib.Parse( p, data, _encoding );                
1613
                        encoding = attrib.Value();
1614
                }
1615
                else if ( StringEqual( p, "standalone", true, _encoding ) )
1616
                {
1617
                        TiXmlAttribute attrib;
1618
                        p = attrib.Parse( p, data, _encoding );                
1619
                        standalone = attrib.Value();
1620
                }
1621
                else
1622
                {
1623
                        // Read over whatever it is.
1624
                        while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1625
                                ++p;
1626
                }
1627
        }
1628
        return 0;
1629
}
1630

    
1631
bool TiXmlText::Blank() const
1632
{
1633
        for ( unsigned i=0; i<value.length(); i++ )
1634
                if ( !IsWhiteSpace( value[i] ) )
1635
                        return false;
1636
        return true;
1637
}
1638