bxmlparser.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. /* My own XML parser. For what good it is :p ... */
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include "ses.h"
  5. #include "bxml.h"
  6. #include "bxmlparser.h"
  7. #define BXML_WS " \t\n\r"
  8. #define BXML_LOWER "abcdefghijklmnopqrstuvwxyz"
  9. #define BXML_UPPER "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  10. #define BXML_ALPHA BXML_LOWER BXML_UPPER
  11. #define BXML_NUM "0123456789"
  12. #define BXML_ALNUM BXML_ALPHA BXML_NUM
  13. /* According to the XML standard, these are spaces */
  14. static const char BXML_SPACE_STR[] = { 0x20, 0x09, 0x0D, 0x0A, 0x00 };
  15. /* State of the parser. Negative states indicate errors.
  16. * State zero means the parser is done. States greater than to
  17. * bxml_FOUND indicate that the parser has found the corresponding tag.
  18. */
  19. enum BxmlState_ {
  20. bxml_STATE_MEMERROR = -3,
  21. bxml_STATE_STACKERROR = -2,
  22. bxml_STATE_ERROR = -1,
  23. bxml_STATE_DONE = 0,
  24. bxml_STATE_START = 1,
  25. bxml_STATE_TAGSTART = 2,
  26. bxml_STATE_TAGNAME = 3,
  27. bxml_STATE_ATTRLIST = 4,
  28. bxml_STATE_ATTRSTART = 5,
  29. bxml_STATE_ATTRNAME = 6,
  30. bxml_STATE_VALSTART = 7,
  31. bxml_STATE_SQVALUE = 8,
  32. bxml_STATE_DQVALUE = 9,
  33. bxml_STATE_TAGEND = 10,
  34. bxml_STATE_TEXT = 11,
  35. bxml_STATE_VALENTITY = 12,
  36. bxml_STATE_TEXTENTITY = 13,
  37. bxml_STATE_COMMENT = 14,
  38. bxml_STATE_DECLARE = 15,
  39. bxml_STATE_PROCESSING = 16,
  40. bxml_STATE_CDATA = 17
  41. };
  42. typedef enum BxmlState_ BxmlState;
  43. #define BXML_PARSER_STACKSIZE 1024
  44. /*
  45. * BxmlParse is the parser object. For simplicity, the parser works on a string
  46. * with the whole xml document in memory. Not very efficient, but easier to parse.
  47. */
  48. struct BxmlParser_ {
  49. Swis buffer;
  50. char * index;
  51. int line;
  52. int col;
  53. int now;
  54. int stack[BXML_PARSER_STACKSIZE];
  55. int sp;
  56. Bxml * tag;
  57. Bxml * root;
  58. const char* error;
  59. };
  60. BxmlParser * bxmlparser_alloc(void) {
  61. return calloc(1, sizeof(BxmlParser));
  62. }
  63. /** Initializes a parser. */
  64. BxmlParser * bxmlparser_init(BxmlParser * me) {
  65. if (!me) return NULL;
  66. if (!swis_new_empty(&me->buffer)) return NULL;
  67. me->index = me->buffer.text;
  68. me->line = 0;
  69. me->col = 0;
  70. me->now = 0;
  71. me->sp = 0;
  72. me->tag = NULL;
  73. me->root = NULL;
  74. me->error = NULL;
  75. return me;
  76. }
  77. BxmlParser * bxmlparser_new() {
  78. return bxmlparser_init(bxmlparser_alloc());
  79. }
  80. /** Cleans up a parser after use. */
  81. BxmlParser * bxmlparser_done(BxmlParser * me) {
  82. if (!me) return NULL;
  83. swis_free(&me->buffer);
  84. me->index = NULL;
  85. me->line = 0;
  86. me->col = 0;
  87. me->now = 0;
  88. me->sp = 0;
  89. me->tag = NULL;
  90. me->root = NULL;
  91. return me;
  92. }
  93. BxmlParser * bxmlparser_free(BxmlParser * me) {
  94. bxmlparser_done(me);
  95. free(me);
  96. return NULL;
  97. }
  98. /* Adds the tag, either as root if no root is set, or as a child of the
  99. * current tag. */
  100. Bxml * bxmlparser_add_tag_size(BxmlParser * me, int kind, int size, ...) {
  101. Bxml * newtag;
  102. va_list args;
  103. va_start(args, size);
  104. newtag = bxml_new_size_va(kind, size, args);
  105. va_end(args);
  106. if (!newtag) return NULL;
  107. if (me->tag) {
  108. bxml_add_child(me->tag, newtag);
  109. } else if (me->root) {
  110. bxml_add_child(me->root, newtag);
  111. } else {
  112. me->root = newtag;
  113. }
  114. return newtag;
  115. }
  116. /* Returns nonzero if the parser is at the end, false if not */
  117. int bxmlparser_is_end(BxmlParser * me) {
  118. if ((me->buffer.text + me->buffer.size) == (me->index)) return TRUE;
  119. if ('\0' == (*(me->index))) return TRUE;
  120. return FALSE;
  121. }
  122. /* Gets curent byte, or '\0' if at end. */
  123. int bxmlparser_now(BxmlParser * me) {
  124. if (bxmlparser_is_end(me)) return '\0';
  125. return *(me->index);
  126. }
  127. /* Gets curent index pointer location. */
  128. char * bxmlparser_index(BxmlParser * me) {
  129. return me->index;
  130. }
  131. /* Advances the parser by one byte (and returns the
  132. * the advance byte). Returns \0 if at end.*/
  133. int bxmlparser_next(BxmlParser * me) {
  134. int now = bxmlparser_now(me);
  135. if (bxmlparser_is_end(me)) return '\0';
  136. if (now != '\0') {
  137. me->index++;
  138. if (now == '\n') {
  139. me->line++;
  140. me->col = 1;
  141. } else {
  142. me->col++;
  143. }
  144. }
  145. return bxmlparser_now(me);
  146. }
  147. /* Skips n characters or until end of string . */
  148. int bxmlparser_skip(BxmlParser * me, int amount) {
  149. int now;
  150. int skipped = 0;
  151. if (bxmlparser_is_end(me)) return -1;
  152. for (now = bxmlparser_now(me); now != '\0' ; now = bxmlparser_next(me)) {
  153. if (skipped == amount) return skipped;
  154. skipped++;
  155. }
  156. return skipped;
  157. }
  158. /* Skips all characters in the set. */
  159. int bxmlparser_skip_in(BxmlParser * me, char * set) {
  160. int now;
  161. int skipped = 0;
  162. if (bxmlparser_is_end(me)) return -1;
  163. for (now = bxmlparser_now(me); now != '\0' ; now = bxmlparser_next(me)) {
  164. if (!strchr(set, now)) return skipped;
  165. skipped++;
  166. }
  167. return skipped;
  168. }
  169. int bxmlparser_skip_not_in(BxmlParser * me, char * set) {
  170. int now;
  171. int skipped = 0;
  172. if (bxmlparser_is_end(me)) return -1;
  173. for (now = bxmlparser_now(me); now != '\0' ; now = bxmlparser_next(me)) {
  174. if (strchr(set, now)) return skipped;
  175. skipped++;
  176. }
  177. return skipped;
  178. }
  179. /* Skips xml whitespace */
  180. int bxmlparser_skip_ws(BxmlParser * me) {
  181. return bxmlparser_skip_in(me, BXML_WS);
  182. }
  183. /* Checks if the current character is inb the given set . */
  184. int bxmlparser_now_in(BxmlParser * me, char * set) {
  185. if (bxmlparser_is_end(me)) return 0;
  186. return (!!strchr(set, bxmlparser_now(me)));
  187. }
  188. /* Checks if the current character is inb the given set . */
  189. int bxmlparser_now_not_in(BxmlParser * me, char * set) {
  190. if (bxmlparser_is_end(me)) return 0;
  191. return (!strchr(set, bxmlparser_now(me)));
  192. }
  193. /* Checks if the current character is whitespace . */
  194. int bxmlparser_now_ws(BxmlParser * me) {
  195. return bxmlparser_now_in(me, BXML_WS);
  196. }
  197. /* Checks if the current location starts with the given prefix */
  198. int bxmlparser_have_prefix(BxmlParser * me, char * prefix) {
  199. if (bxmlparser_is_end(me)) return FALSE;
  200. return (strncmp(me->index, prefix, strlen(prefix)) == 0);
  201. }
  202. /* Tries to find the next "suffix" at or after the current parser position.
  203. * Returns the relative position index of that suffix, or negative if not found.
  204. */
  205. int bxmlparser_find_suffix(BxmlParser * me, char * suffix) {
  206. char * aid;
  207. aid = strstr(me->index, suffix);
  208. if (aid) {
  209. return aid - me->index;
  210. } else {
  211. return -1;
  212. }
  213. }
  214. /* Skips until a given suffix is skipped. Returns true if he suffix was found,
  215. * ro false if not. */
  216. int bxmlparser_skip_suffix(BxmlParser * me, char * suffix) {
  217. int aid;
  218. aid = bxmlparser_find_suffix(me, suffix);
  219. if (aid < 0) return 0;
  220. /* Aslo skip suffix itself! */
  221. aid += strlen(suffix);
  222. return (bxmlparser_skip(me, aid) == aid);
  223. }
  224. /* Skips until a given prefix is skipped. Returns true if the prefix was found,
  225. * or false if not. */
  226. int bxmlparser_skip_prefix(BxmlParser * me, char * prefix) {
  227. int aid;
  228. if (!bxmlparser_have_prefix(me, prefix)) return 0;
  229. aid = strlen(prefix);
  230. if (aid < 0) return 0;
  231. return (bxmlparser_skip(me, aid) == aid);
  232. }
  233. BxmlParser * bxmlparser_set_error(BxmlParser * me, const char * error) {
  234. me->error = error;
  235. return NULL;
  236. }
  237. /*
  238. Here is an overview of the syntax of full XML.
  239. My parser will only support a subset of that.
  240. [1] document ::= prolog element Misc*
  241. [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  242. ( any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. )
  243. [3] S ::= (#x20 | #x9 | #xD | #xA)+
  244. (
  245. The presence of #xD in the above production is maintained purely for backward compatibility with the First Edition. As explained in 2.11 End-of-Line Handling, all #xD characters literally present in an XML document are either removed or replaced by #xA characters before any other processing is done. The only way to get a #xD character to match this production is to use a character reference in an entity value literal.
  246. )
  247. [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  248. [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
  249. [5] Name ::= NameStartChar (NameChar)*
  250. [6] Names ::= Name (#x20 Name)*
  251. [7] Nmtoken ::= (NameChar)+
  252. [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
  253. Literals
  254. [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
  255. | "'" ([^%&'] | PEReference | Reference)* "'"
  256. [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
  257. | "'" ([^<&'] | Reference)* "'"
  258. [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  259. [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  260. [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
  261. Character Data
  262. [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  263. Comments
  264. [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  265. [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  266. [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
  267. [18] CDSect ::= CDStart CData CDEnd
  268. [19] CDStart ::= '<![CDATA['
  269. [20] CData ::= (Char* - (Char* ']]>' Char*))
  270. [21] CDEnd ::= ']]>'
  271. Prolog
  272. [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  273. [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  274. [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  275. [25] Eq ::= S? '=' S?
  276. [26] VersionNum ::= '1.' [0-9]+
  277. [27] Misc ::= Comment | PI | S
  278. [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' [VC: Root Element Type]
  279. [WFC: External Subset]
  280. [28a] DeclSep ::= PEReference | S [WFC: PE Between Declarations]
  281. [28b] intSubset ::= (markupdecl | DeclSep)*
  282. [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment [VC: Proper Declaration/PE Nesting]
  283. [WFC: PEs in Internal Subset]
  284. External Subset
  285. [30] extSubset ::= TextDecl? extSubsetDecl
  286. [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
  287. Standalone Document Declaration
  288. [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) [VC: Standalone Document Declaration]
  289. NOTE:
  290. To simplify the tasks of applications, the XML processor MUST behave as if it normalized all line breaks in external parsed entities (including the document entity) on input, before parsing, by translating both the two-character sequence #xD #xA and any #xD that is not followed by #xA to a single #xA character.
  291. Element
  292. [39] element ::= EmptyElemTag
  293. | STag content ETag [WFC: Element Type Match]
  294. [VC: Element Valid]
  295. Start-tag
  296. [40] STag ::= '<' Name (S Attribute)* S? '>' [WFC: Unique Att Spec]
  297. [41] Attribute ::= Name Eq AttValue [VC: Attribute Value Type]
  298. [WFC: No External Entity References]
  299. [WFC: No < in Attribute Values]
  300. End-tag
  301. [42] ETag ::= '</' Name S? '>'
  302. Content of Elements
  303. [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
  304. Tags for Empty Elements
  305. [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec]
  306. Element Type Declaration
  307. [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration]
  308. [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
  309. Element-content Models
  310. [47] children ::= (choice | seq) ('?' | '*' | '+')?
  311. [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  312. [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' [VC: Proper Group/PE Nesting]
  313. [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' [VC: Proper Group/PE Nesting]
  314. Mixed-content Declaration
  315. [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
  316. | '(' S? '#PCDATA' S? ')' [VC: Proper Group/PE Nesting]
  317. [VC: No Duplicate Types]
  318. Attribute-list Declaration
  319. [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
  320. [53] AttDef ::= S Name S AttType S DefaultDecl
  321. [54] AttType ::= StringType | TokenizedType | EnumeratedType
  322. [55] StringType ::= 'CDATA'
  323. [56] TokenizedType ::= 'ID' [VC: ID]
  324. [VC: One ID per Element Type]
  325. [VC: ID Attribute Default]
  326. | 'IDREF' [VC: IDREF]
  327. | 'IDREFS' [VC: IDREF]
  328. | 'ENTITY' [VC: Entity Name]
  329. | 'ENTITIES' [VC: Entity Name]
  330. | 'NMTOKEN' [VC: Name Token]
  331. | 'NMTOKENS' [VC: Name Token]
  333. Enumerated Attribute Types
  334. [57] EnumeratedType ::= NotationType | Enumeration
  335. [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' [VC: Notation Attributes]
  336. [VC: One Notation Per Element Type]
  337. [VC: No Notation on Empty Element]
  338. [VC: No Duplicate Tokens]
  339. [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration]
  340. [VC: No Duplicate Tokens]
  341. [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
  342. | (('#FIXED' S)? AttValue) [VC: Required Attribute]
  343. [VC: Attribute Default Value Syntactically Correct]
  344. [WFC: No < in Attribute Values]
  345. [VC: Fixed Attribute Default]
  346. [WFC: No External Entity References]
  347. Conditional Section
  348. [61] conditionalSect ::= includeSect | ignoreSect
  349. [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' [VC: Proper Conditional Section/PE Nesting]
  350. [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' [VC: Proper Conditional Section/PE Nesting]
  351. [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
  352. [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
  353. 4.1 Character and Entity References
  354. [66] CharRef ::= '&#' [0-9]+ ';'
  355. | '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character]
  356. Entity Reference
  357. [67] Reference ::= EntityRef | CharRef
  358. [68] EntityRef ::= '&' Name ';' [WFC: Entity Declared]
  359. [VC: Entity Declared]
  360. [WFC: Parsed Entity]
  361. [WFC: No Recursion]
  362. [69] PEReference ::= '%' Name ';' [VC: Entity Declared]
  363. [WFC: No Recursion]
  364. [WFC: In DTD]
  365. Entity Declaration
  366. [70] EntityDecl ::= GEDecl | PEDecl
  367. [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
  368. [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
  369. [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
  370. [74] PEDef ::= EntityValue | ExternalID
  371. External Entity Declaration
  372. [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  373. | 'PUBLIC' S PubidLiteral S SystemLiteral
  374. [76] NDataDecl ::= S 'NDATA' S Name [VC: Notation Declared]
  375. Text Declaration
  376. [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  377. Well-Formed External Parsed Entity
  378. [78] extParsedEnt ::= TextDecl? content
  379. [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
  380. [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* ( Encoding name contains only Latin characters )
  381. 4.6 Predefined Entities
  382. <!ENTITY lt "&#38;#60;">
  383. <!ENTITY gt "&#62;">
  384. <!ENTITY amp "&#38;#38;">
  385. <!ENTITY apos "&#39;">
  386. <!ENTITY quot "&#34;">
  387. Notation Declarations
  388. [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' [VC: Unique Notation Name]
  389. [83] PublicID ::= 'PUBLIC' S PubidLiteral
  390. */
  391. BxmlParser * bxmlparser_parse_comment(BxmlParser * me) {
  392. bxmlparser_skip_prefix(me, "<!--");
  393. if (!bxmlparser_skip_suffix(me, "-->")) {
  394. return bxmlparser_set_error(me, "cannot find closing -->");
  395. }
  396. return me;
  397. }
  398. BxmlParser * bxmlparser_parse_cdata(BxmlParser * me) {
  399. Bxml * last;
  400. char * start;
  401. int size;
  402. bxmlparser_skip_prefix(me, "<![CDATA[");
  403. start = bxmlparser_index(me);
  404. size = bxmlparser_find_suffix(me, "]]>");
  405. if (size < 0) {
  406. return bxmlparser_set_error(me, "cannot find end of CDATA ]]");
  407. }
  408. last = bxml_get_last_child(me->tag);
  409. /* compact text if possible */
  410. if (last && (last->kind == BXML_TEXT)) {
  411. bxml_append_buf(last, start, size);
  412. } else {
  413. bxmlparser_add_tag_size(me, BXML_TEXT, size, start);
  414. }
  415. bxmlparser_skip(me, size + 3);
  416. return me;
  417. }
  418. BxmlParser * bxmlparser_parse_xmldecl(BxmlParser * me) {
  419. if (!bxmlparser_skip_suffix(me, "?>")) {
  420. return bxmlparser_set_error(me, "cannot find closing ?> of xml declaration tag");
  421. }
  422. /* TODO: implement this. */
  423. /* Skip whitespace to make sure following tags are found. */
  424. bxmlparser_skip_ws(me);
  425. return me;
  426. }
  427. BxmlParser * bxmlparser_parse_processing(BxmlParser * me) {
  428. if (!bxmlparser_skip_suffix(me, "?>")) {
  429. return bxmlparser_set_error(me,"cannot find closing ?>");
  430. }
  431. /* TODO: implement this. */
  432. return me;
  433. }
  434. /* Parses a doctype declaration.
  435. * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  436. */
  437. BxmlParser * bxmlparser_parse_doctype(BxmlParser * me) {
  438. if(!bxmlparser_skip_prefix(me, "<!DOCTYPE")) return NULL;
  439. if (!bxmlparser_skip_suffix(me, ">")) {
  440. return bxmlparser_set_error(me, "cannot find closing > for doctype");
  441. }
  442. /* TODO: implement this. */
  443. return me;
  444. }
  445. BxmlParser * bxmlparser_parse_entity(BxmlParser * me) {
  446. if(!bxmlparser_skip_prefix(me, "<!ENTITY")) return NULL;
  447. if (!bxmlparser_skip_suffix(me, ">")) {
  448. return bxmlparser_set_error(me,"cannot find closing > for entity");
  449. }
  450. /* TODO: implement this. */
  451. return me;
  452. }
  453. BxmlParser * bxmlparser_parse_attlist_tag(BxmlParser * me) {
  454. if(!bxmlparser_skip_prefix(me, "<!ATTLIST")) return NULL;
  455. if (!bxmlparser_skip_suffix(me, ">")) {
  456. return bxmlparser_set_error(me, "cannot find closing > for attlist");
  457. }
  458. /* TODO: implement this. */
  459. return me;
  460. }
  461. int bxmlparser_have_comment(BxmlParser * me) {
  462. return (bxmlparser_have_prefix(me, "<!--"));
  463. }
  464. int bxmlparser_have_end_tag(BxmlParser * me) {
  465. return (bxmlparser_have_prefix(me, "</"));
  466. }
  467. int bxmlparser_have_processing(BxmlParser * me) {
  468. return (bxmlparser_have_prefix(me, "<?"));
  469. }
  470. int bxmlparser_have_reference(BxmlParser * me) {
  471. return (bxmlparser_have_prefix(me, "&"));
  472. }
  473. int bxmlparser_have_bang(BxmlParser * me) {
  474. return (bxmlparser_have_prefix(me, "<!"));
  475. }
  476. int bxmlparser_have_cdata(BxmlParser * me) {
  477. return (bxmlparser_have_prefix(me, "<![CDATA["));
  478. }
  479. int bxmlparser_have_element(BxmlParser * me) {
  480. if (bxmlparser_have_processing(me)) return 0;
  481. if (bxmlparser_have_bang(me)) return 0;
  482. if (bxmlparser_have_end_tag(me)) return 0;
  483. return bxmlparser_have_prefix(me, "<");
  484. }
  485. int bxmlparser_have_chardata(BxmlParser * me) {
  486. if (bxmlparser_have_prefix(me, "<")) return 0;
  487. if (bxmlparser_have_prefix(me, "&")) return 0;
  488. return !bxmlparser_is_end(me);
  489. }
  490. /*
  491. * [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  492. * [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
  493. * [5] Name ::= NameStartChar (NameChar)*
  494. *
  495. * However I won't support non ASCII names of now
  496. * If ok Name is set to point into the parer's buffer and should not be freed.
  497. */
  498. BxmlParser * bxmlparser_parse_name(BxmlParser * me, char ** name, int * size) {
  499. int size1, size2;
  500. char * text;
  501. text = bxmlparser_index(me);
  502. size1 = bxmlparser_skip_in(me, BXML_ALPHA ":_");
  503. if (size1 < 0) return bxmlparser_set_error(me, "unexpected character in name");
  504. if (size1 < 1) return bxmlparser_set_error(me, "name too short");
  505. size2 = bxmlparser_skip_in(me, BXML_ALNUM ":_-.");
  506. if (size2 < 0) return bxmlparser_set_error(me, "unexpected character in rest of name");
  507. (*name) = text;
  508. (*size) = size1 + size2;
  509. return me;
  510. }
  511. int bxmlparser_encode_utf8(long lchar, char * buf) {
  512. int bits;
  513. int bytes;
  514. int result;
  515. if (lchar < 0x80) { /* ASCII */
  516. buf[0] = (char) lchar;
  517. result = 1;
  518. } else {
  519. /* Bit and byte count */
  520. for (bits = 0; lchar; lchar /=2) bits++;
  521. bytes = (bits - 2) / 5;
  522. result = bytes;
  523. /* First header byte. */
  524. (*buf) = (0xFF << (7 - bytes)) | (lchar >> (6 * bytes));
  525. buf++;
  526. /* Data bytes. */
  527. while (bytes) {
  528. (*buf) = 0x80 | ( lchar >> (6 * bytes));
  529. buf++;
  530. bytes--;
  531. }
  532. }
  533. return result;
  534. }
  535. /* Use as table to look up the character of the given entities. */
  536. static char * bxmlparser_entity_lut[6][2] = {
  537. { "&" , "amp" },
  538. { "'" , "apos" },
  539. { ">" , "gt" },
  540. { "<" , "lt" },
  541. { "\"" , "quot" },
  542. { NULL , NULL }
  543. };
  544. char * bxmlparser_lookup_entity(BxmlParser * me, char * name, size_t size) {
  545. int index;
  546. for (index = 0; bxmlparser_entity_lut[index][0]; index ++) {
  547. if (0 == strncmp(bxmlparser_entity_lut[index][1], name, size)) {
  548. return bxmlparser_entity_lut[index][0];
  549. }
  550. }
  551. (void) me;
  552. return NULL;
  553. }
  554. static char bxmlparser_reference_buffer[12];
  555. /* Parses an entity reference into a string. Returns a reference to a const or
  556. * static string buffer and a size. */
  557. BxmlParser * bxmlparser_parse_reference_to_string(BxmlParser * me, char ** str_out, int * size_out) {
  558. int numerical;
  559. int size;
  560. long lchar;
  561. char * text;
  562. if (!bxmlparser_skip_prefix(me, "&")) return bxmlparser_set_error(me, "starting & of reference not found");
  563. numerical = bxmlparser_skip_prefix(me, "#");
  564. text = bxmlparser_index(me);
  565. size = bxmlparser_find_suffix(me, ";");
  566. if (size < 1) return bxmlparser_set_error(me, "empty reference");
  567. bxmlparser_skip_suffix(me, ";");
  568. if (numerical) {
  569. int base = 10;
  570. int bytes;
  571. if ((text[0] == 'x') || (text[0] == 'X')) {
  572. base = 16;
  573. text++;
  574. size--;
  575. }
  576. errno = 0;
  577. lchar = strtol(text, NULL, base);
  578. if (errno)
  579. if (size > (int)(sizeof(bxmlparser_reference_buffer) - 1)) {
  580. size = (sizeof(bxmlparser_reference_buffer) - 1);
  581. }
  582. bytes = bxmlparser_encode_utf8(lchar, bxmlparser_reference_buffer);
  583. (*str_out) = bxmlparser_reference_buffer;
  584. (*size_out) = bytes;
  585. } else {
  586. char * replace = bxmlparser_lookup_entity(me, text, size);
  587. if (!replace) return bxmlparser_set_error(me, "Unknown entity reference!");
  588. (*str_out) = replace;
  589. (*size_out) = strlen(replace);
  590. }
  591. return me;
  592. }
  593. /* Parses an entity reference. */
  594. BxmlParser * bxmlparser_parse_reference(BxmlParser * me) {
  595. char * str;
  596. int size;
  597. Bxml * last;
  598. if (!bxmlparser_parse_reference_to_string(me, &str, &size)) {
  599. return NULL;
  600. }
  601. last = bxml_get_last_child(me->tag);
  602. /* compact text if possible */
  603. if (last && (last->kind == BXML_TEXT)) {
  604. bxml_append_buf(last, str, size);
  605. } else {
  606. bxmlparser_add_tag_size(me, BXML_TEXT, size, str);
  607. }
  608. return me;
  609. }
  610. /* [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
  611. | "'" ([^<&'] | Reference)* "'"
  612. */
  613. BxmlParser * bxmlparser_parse_attribute_value(BxmlParser * me, Swis * buffer) {
  614. char * start;
  615. char * ref;
  616. int size;
  617. int ch;
  618. if (!bxmlparser_skip_prefix(me, "\""))
  619. return bxmlparser_set_error(me, "cannot find start of attribute value");
  620. ch = bxmlparser_now(me);
  621. while (ch != '"') {
  622. if (ch == '&') {
  623. if(!bxmlparser_parse_reference_to_string(me, &ref, &size))
  624. return NULL;
  625. swis_append_buf(buffer, ref, size);
  626. ch = bxmlparser_now(me);
  627. } else {
  628. swis_append_char(buffer, ch);
  629. ch = bxmlparser_next(me);
  630. }
  631. }
  632. if (!bxmlparser_skip_prefix(me, "\""))
  633. return bxmlparser_set_error(me, "cannot find end of attribute value");
  634. return me;
  635. }
  636. /* [41] Attribute ::= Name Eq AttValue */
  637. BxmlParser * bxmlparser_parse_attribute(BxmlParser * me, Bxml * tag) {
  638. char * name = NULL;
  639. int namesize;
  640. Swis value;
  641. bxmlparser_skip_ws(me);
  642. if (!bxmlparser_parse_name(me, &name, &namesize)) {
  643. return bxmlparser_set_error(me, "Cannot parse name in attribute.");
  644. }
  645. bxmlparser_skip_ws(me);
  646. if (!bxmlparser_skip_prefix(me, "=")) {
  647. return bxmlparser_set_error(me, "Cannot find = in attribute.");
  648. }
  649. bxmlparser_skip_ws(me);
  650. if (!swis_new_empty(&value))
  651. return bxmlparser_set_error(me, "out of memory in attribute value parsing");
  652. if (!bxmlparser_parse_attribute_value(me, &value)) {
  653. swis_free(&value);
  654. return bxmlparser_set_error(me, "Cannot parse attribute value");
  655. }
  656. bxml_new_attribute_size(tag, name, namesize, value.text, value.size);
  657. swis_free(&value);
  658. return me;
  659. }
  660. BxmlParser * bxmlparser_parse_attribute_list(BxmlParser * me, Bxml * tag) {
  661. bxmlparser_skip_ws(me);
  662. while (!bxmlparser_now_in(me, "/>")) {
  663. if (!bxmlparser_parse_attribute(me, tag)) return NULL;
  664. bxmlparser_skip_ws(me);
  665. }
  666. return me;
  667. }
  668. /* Parses in-between character data that is not in a CDATA section. */
  669. BxmlParser * bxmlparser_parse_chardata(BxmlParser * me) {
  670. Bxml * last;
  671. int size;
  672. char * text;
  673. text = bxmlparser_index(me);
  674. size = bxmlparser_skip_not_in(me, "<&");
  675. if (size < 0) {
  676. return bxmlparser_set_error(me, "trouble in character data");
  677. }
  678. last = bxml_get_last_child(me->tag);
  679. /* compact text if possible */
  680. if (last && (last->kind == BXML_TEXT)) {
  681. bxml_append_buf(last, text, size);
  682. } else {
  683. if (!bxmlparser_add_tag_size(me, BXML_TEXT, size, text)) {
  684. return bxmlparser_set_error(me, "could not allocate character tag");
  685. }
  686. }
  687. return me;
  688. }
  689. /* Forward declaration. */
  690. BxmlParser * bxmlparser_parse_element(BxmlParser * me);
  691. /* Parses the contents of a non-empty tag.
  692. * [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
  693. */
  694. BxmlParser * bxmlparser_parse_contents(BxmlParser * me) {
  695. Bxml * tagi = NULL;
  696. int done = 0;
  697. if (bxmlparser_have_chardata(me)) {
  698. if (!bxmlparser_parse_chardata(me)) return NULL;
  699. }
  700. while (!done) {
  701. Bxml * now = me->tag;
  702. if (bxmlparser_have_reference(me)) {
  703. if (!bxmlparser_parse_reference(me)) return NULL;
  704. } else if (bxmlparser_have_cdata(me)) {
  705. if (!bxmlparser_parse_cdata(me)) return NULL;
  706. } else if (bxmlparser_have_processing(me)) {
  707. if (!bxmlparser_parse_processing(me)) return NULL;
  708. } else if (bxmlparser_have_comment(me)) {
  709. if (!bxmlparser_parse_comment(me)) return NULL;
  710. } if (bxmlparser_have_element(me)) {
  711. if (!bxmlparser_parse_element(me)) return NULL;
  712. } else {
  713. done = 1;
  714. }
  715. /* Restore current tag from the ravages of recursion. */
  716. me->tag = now;
  717. /* Try to parse any other chardata if needed. */
  718. if (bxmlparser_have_chardata(me)) {
  719. if (!bxmlparser_parse_chardata(me)) return NULL;
  720. }
  721. }
  722. return me;
  723. }
  724. /* Parses the opening <, the name and the attributes of a tag. */
  725. BxmlParser * bxmlparser_parse_tag_head(BxmlParser * me) {
  726. char * name;
  727. int size;
  728. Bxml * tag;
  729. if (!bxmlparser_skip_prefix(me, "<")) {
  730. return bxmlparser_set_error(me, "Cannot find opening < of tag.");
  731. }
  732. if (!bxmlparser_parse_name(me, &name, &size)) {
  733. return bxmlparser_set_error(me, "Cannot parse name of tag.");
  734. }
  735. tag = bxmlparser_add_tag_size(me, BXML_TAG, size, name);
  736. me->tag = tag;
  737. if (!bxmlparser_parse_attribute_list(me, tag)) {
  738. return bxmlparser_set_error(me, "Cannot parse attribute list of tag.");
  739. }
  740. bxmlparser_skip_ws(me);
  741. return me;
  742. }
  743. /* STag content ETag */
  744. BxmlParser * bxmlparser_parse_open_close_tag(BxmlParser * me) {
  745. char * name;
  746. int size;
  747. /* start tag */
  748. if (!bxmlparser_skip_prefix(me, ">")) {
  749. return bxmlparser_set_error(me, "Cannot find closing > of tag.");
  750. }
  751. /* contents of the tag */
  752. if (!bxmlparser_parse_contents(me)) return NULL;
  753. /* end tag */
  754. if (!bxmlparser_skip_prefix(me, "</")) return NULL;
  755. if (!bxmlparser_parse_name(me, &name, &size)) {
  756. return bxmlparser_set_error(me, "Cannot parse name of end of tag.");
  757. }
  758. bxmlparser_skip_ws(me);
  759. if (!bxmlparser_skip_prefix(me, ">")) {
  760. return bxmlparser_set_error(me, "Cannot find closing > of end tag");
  761. }
  762. if (strncmp(me->tag->name, name, (size_t)size) != 0) {
  763. return bxmlparser_set_error(me, "Name of end tag does not match begin tag.");
  764. }
  765. return me;
  766. }
  767. /*
  768. [44] EmptyTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec]
  769. */
  770. BxmlParser * bxmlparser_parse_empty_tag(BxmlParser * me) {
  771. if (!bxmlparser_skip_prefix(me, "/>")) return NULL;
  772. return me;
  773. }
  774. /* element := EmptyElemTag | STag content ETag */
  775. BxmlParser * bxmlparser_parse_element(BxmlParser * me) {
  776. int now;
  777. now = bxmlparser_now(me);
  778. if (now != '<') {
  779. return bxmlparser_set_error(me, "cannot find beginning of tag");
  780. }
  781. /* Parse the head of the tag first. */
  782. if (!bxmlparser_parse_tag_head(me)) return NULL;
  783. /* now check for a close tag. */
  784. bxmlparser_skip_ws(me);
  785. /* If found, it's an empty tag, otherwise try a normal tag. */
  786. if (bxmlparser_have_prefix(me, "/>")) {
  787. return bxmlparser_parse_empty_tag(me);
  788. } else {
  789. return bxmlparser_parse_open_close_tag(me);
  790. }
  791. }
  792. /**
  793. * Checks if a misc tag seems to be ready.
  794. */
  795. int bxmlparser_have_misc(BxmlParser * me) {
  796. if (bxmlparser_have_prefix(me, "<!--")) return 1;
  797. if (bxmlparser_have_prefix(me, "<?")) return 2;
  798. if (bxmlparser_now_ws(me)) return 3;
  799. return 0;
  800. }
  801. /*
  802. * Parses a Misc part of XML as per the rule:
  803. [27] Misc ::= Comment | PI | S
  804. */
  805. BxmlParser * bxmlparser_parse_misc(BxmlParser * me) {
  806. if (bxmlparser_have_comment(me)) return bxmlparser_parse_comment(me);
  807. if (bxmlparser_have_processing(me)) return bxmlparser_parse_processing(me);
  808. if (bxmlparser_now_ws(me)) {
  809. bxmlparser_skip_ws(me);
  810. return me;
  811. }
  812. return NULL;
  813. }
  814. /* Parses Misc* lists */
  815. BxmlParser * bxmlparser_parse_misc_list(BxmlParser * me) {
  816. BxmlParser * ok;
  817. while (bxmlparser_have_misc(me)) {
  818. ok = bxmlparser_parse_misc(me);
  819. if (!ok) return NULL;
  820. }
  821. return me;
  822. }
  823. /*
  824. * Parses a prolog as per rule 22:
  825. * [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  826. */
  827. BxmlParser * bxmlparser_parse_prolog(BxmlParser * me) {
  828. BxmlParser * ok;
  829. if (bxmlparser_have_prefix(me, "<?xml")) {
  830. ok = bxmlparser_parse_xmldecl(me);
  831. if (!ok) return NULL;
  832. }
  833. ok = bxmlparser_parse_misc_list(me);
  834. if (!ok) return NULL;
  835. if (bxmlparser_have_prefix(me, "<!DOCTYPE")) {
  836. ok = bxmlparser_parse_doctype(me);
  837. if (!ok) return NULL;
  838. ok = bxmlparser_parse_misc_list(me);
  839. if (!ok) return NULL;
  840. }
  841. return ok;
  842. }
  843. /*
  844. * Parses a document as per rule [1].
  845. * document ::= prolog element Misc*
  846. */
  847. BxmlParser * bxmlparser_parse_document(BxmlParser * me) {
  848. BxmlParser * ok = bxmlparser_parse_prolog(me);
  849. if (!ok) return NULL;
  850. ok = bxmlparser_parse_element(me);
  851. if (!ok) return NULL;
  852. ok = bxmlparser_parse_misc_list(me);
  853. return ok;
  854. }
  855. /* Initiates the parse. */
  856. BxmlParser * bxmlparser_parse(BxmlParser * me) {
  857. bxmlparser_skip_ws(me);
  858. return bxmlparser_parse_document(me);
  859. }
  860. Bxml * bxmlparser_parse_buf(BxmlParser * me, char * buf, int size) {
  861. Bxml * result = NULL;
  862. if ((!me) || (!buf)) {
  863. return NULL;
  864. }
  865. swis_init_buf(&(me->buffer), buf, size);
  866. bxmlparser_parse(me);
  867. result = me->root;
  868. return result;
  869. }
  870. Bxml * bxmlparser_parse_str(BxmlParser * me, char * buf) {
  871. return bxmlparser_parse_buf(me, buf, strlen(buf));
  872. }
  873. /** Reads in a file into the parser's buffer. */
  874. BxmlParser * bxmlparser_read_file(BxmlParser * me, FILE * file) {
  875. if (!swis_read_file(&me->buffer, file)) return NULL;
  876. me->index = me->buffer.text;
  877. return me;
  878. }
  879. Bxml * bxmlparser_parse_file(BxmlParser * me, FILE * file) {
  880. Bxml * result = NULL;
  881. if (!file) {
  882. return NULL;
  883. }
  884. if (bxmlparser_read_file(me, file)) {
  885. bxmlparser_parse(me);
  886. result = me->root;
  887. }
  888. return result;
  889. }
  890. Bxml * bxmlparser_parse_filename(BxmlParser * me, char * filename) {
  891. Bxml * result;
  892. FILE * file;
  893. file = fopen(filename, "r");
  894. result = bxmlparser_parse_file(me, file);
  895. fclose(file);
  896. return result;
  897. }
  898. const char * bxmlparser_get_error(BxmlParser * me) {
  899. return me->error;
  900. }
  901. int bxmlparser_get_line(BxmlParser * me) {
  902. return me->line;
  903. }
  904. int bxmlparser_get_column(BxmlParser * me) {
  905. return me->col;
  906. }