daplex.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* Copyright 2009, UCAR/Unidata and OPeNDAP, Inc.
  2. See the COPYRIGHT file for more information. */
  3. #include "config.h"
  4. #include <strings.h>
  5. #include "dapparselex.h"
  6. #undef URLCVT /* NEVER turn this on */
  7. #define DAP2ENCODE
  8. /* Forward */
  9. static void dumptoken(DAPlexstate* lexstate);
  10. static void dapaddyytext(DAPlexstate* lex, int c);
  11. #ifndef DAP2ENCODE
  12. static int tohex(int c);
  13. #endif
  14. /****************************************************/
#if 0 /* Following definitions are for informational purposes only; they are never compiled */
/* Set of all non-alphanumeric ASCII printable characters (space and punctuation) */
static char ascii[] = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~";
/* Define the set of legal nonalphanum characters as specified in the DAP2 spec. */
static char* daplegal ="_!~*'-\"";
#endif
  21. static char* ddsworddelims =
  22. "{}[]:;=,";
  23. /* Define 1 and > 1st legal characters */
  24. static char* ddswordchars1 =
  25. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*";
  26. static char* ddswordcharsn =
  27. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#";
  28. static char* daswordcharsn =
  29. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#:";
  30. static char* cewordchars1 =
  31. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
  32. static char* cewordcharsn =
  33. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
  34. /* Current sets of legal characters */
  35. /*
  36. static char* wordchars1 = NULL;
  37. static char* wordcharsn = NULL;
  38. static char* worddelims = NULL;
  39. */
  40. static char* keywords[] = {
  41. "alias",
  42. "array",
  43. "attributes",
  44. "byte",
  45. "dataset",
  46. "error",
  47. "float32",
  48. "float64",
  49. "grid",
  50. "int16",
  51. "int32",
  52. "maps",
  53. "sequence",
  54. "string",
  55. "structure",
  56. "uint16",
  57. "uint32",
  58. "url",
  59. "code",
  60. "message",
  61. "program_type",
  62. "program",
  63. NULL /* mark end of the keywords list */
  64. };
  65. static int keytokens[] = {
  66. SCAN_ALIAS,
  67. SCAN_ARRAY,
  68. SCAN_ATTR,
  69. SCAN_BYTE,
  70. SCAN_DATASET,
  71. SCAN_ERROR,
  72. SCAN_FLOAT32,
  73. SCAN_FLOAT64,
  74. SCAN_GRID,
  75. SCAN_INT16,
  76. SCAN_INT32,
  77. SCAN_MAPS,
  78. SCAN_SEQUENCE,
  79. SCAN_STRING,
  80. SCAN_STRUCTURE,
  81. SCAN_UINT16,
  82. SCAN_UINT32,
  83. SCAN_URL,
  84. SCAN_CODE,
  85. SCAN_MESSAGE,
  86. SCAN_PTYPE,
  87. SCAN_PROG
  88. };
  89. /**************************************************/
  90. int
  91. daplex(YYSTYPE* lvalp, DAPparsestate* state)
  92. {
  93. DAPlexstate* lexstate = state->lexstate;
  94. int token;
  95. int c;
  96. unsigned int i;
  97. char* p;
  98. char* tmp;
  99. token = 0;
  100. ocbytesclear(lexstate->yytext);
  101. /* invariant: p always points to current char */
  102. for(p=lexstate->next;token==0&&(c=*p);p++) {
  103. if(c == '\n') {
  104. lexstate->lineno++;
  105. } else if(c <= ' ' || c == '\177') {
  106. /* whitespace: ignore */
  107. } else if(c == '#') {
  108. /* single line comment */
  109. while((c=*(++p))) {if(c == '\n') break;}
  110. } else if(strchr(lexstate->worddelims,c) != NULL) {
  111. /* don't put in lexstate->yytext to avoid memory leak */
  112. token = c;
  113. } else if(c == '"') {
  114. int more = 1;
  115. /* We have a string token; will be reported as WORD_STRING */
  116. while(more && (c=*(++p))) {
  117. #ifdef DAP2ENCODE
  118. if(c == '"')
  119. more = 0;
  120. else if(c == '\\') {
  121. /* Remove spec ambiguity by convering \c to c
  122. for any character c */
  123. c=*(++p);
  124. if(c == '\0') more = 0;
  125. }
  126. #else /*Non-standard*/
  127. switch (c) {
  128. case '"': more=0; break;
  129. case '\\':
  130. c=*(++p);
  131. switch (c) {
  132. case 'r': c = '\r'; break;
  133. case 'n': c = '\n'; break;
  134. case 'f': c = '\f'; break;
  135. case 't': c = '\t'; break;
  136. case 'x': {
  137. int d1,d2;
  138. c = '?';
  139. ++p;
  140. d1 = tohex(*p++);
  141. if(d1 < 0) {
  142. daperror(state,"Illegal \\xDD in TOKEN_STRING");
  143. } else {
  144. d2 = tohex(*p++);
  145. if(d2 < 0) {
  146. daperror(state,"Illegal \\xDD in TOKEN_STRING");
  147. } else {
  148. c=(((unsigned int)d1)<<4) | (unsigned int)d2;
  149. }
  150. }
  151. } break;
  152. default: break;
  153. }
  154. break;
  155. default: break;
  156. }
  157. #endif /*!DAP2ENCODE*/
  158. if(more) dapaddyytext(lexstate,c);
  159. }
  160. token=WORD_STRING;
  161. } else if(strchr(lexstate->wordchars1,c) != NULL) {
  162. int isdatamark = 0;
  163. /* we have a WORD_WORD */
  164. dapaddyytext(lexstate,c);
  165. while((c=*(++p))) {
  166. #ifdef URLCVT
  167. if(c == '%' && p[1] != 0 && p[2] != 0
  168. && strchr(hexdigits,p[1]) != NULL
  169. && strchr(hexdigits,p[2]) != NULL) {
  170. int d1,d2;
  171. d1 = tohex(p[1]);
  172. d2 = tohex(p[2]);
  173. if(d1 >= 0 || d2 >= 0) {
  174. c=(((unsigned int)d1)<<4) | (unsigned int)d2;
  175. p+=2;
  176. }
  177. } else {
  178. if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
  179. }
  180. dapaddyytext(lexstate,c);
  181. #else
  182. if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
  183. dapaddyytext(lexstate,c);
  184. #endif
  185. }
  186. /* Special check for Data: */
  187. tmp = ocbytescontents(lexstate->yytext);
  188. if(strcmp(tmp,"Data")==0 && *p == ':') {
  189. dapaddyytext(lexstate,*p); p++;
  190. if(p[0] == '\n') {
  191. token = SCAN_DATA;
  192. isdatamark = 1;
  193. p++;
  194. } else if(p[0] == '\r' && p[1] == '\n') {
  195. token = SCAN_DATA;
  196. isdatamark = 1;
  197. p+=2;
  198. }
  199. }
  200. if(!isdatamark) {
  201. /* check for keyword */
  202. token=WORD_WORD; /* assume */
  203. for(i=0;;i++) {
  204. if(keywords[i] == NULL) break;
  205. if(strcasecmp(keywords[i],tmp)==0) {
  206. token=keytokens[i];
  207. break;
  208. }
  209. }
  210. }
  211. } else { /* illegal */
  212. }
  213. }
  214. lexstate->next = p;
  215. strncpy(lexstate->lasttokentext,ocbytescontents(lexstate->yytext),MAX_TOKEN_LENGTH);
  216. lexstate->lasttoken = token;
  217. if(ocdebug >= 2)
  218. dumptoken(lexstate);
  219. /*Put return value onto Bison stack*/
  220. if(ocbyteslength(lexstate->yytext) == 0)
  221. *lvalp = NULL;
  222. else {
  223. *lvalp = ocbytesdup(lexstate->yytext);
  224. oclistpush(lexstate->reclaim,(ocelem)*lvalp);
  225. }
  226. return token; /* Return the type of the token. */
  227. }
  228. static void
  229. dapaddyytext(DAPlexstate* lex, int c)
  230. {
  231. ocbytesappend(lex->yytext,(char)c);
  232. }
  233. #ifndef DAP2ENCODE
  234. static int
  235. tohex(int c)
  236. {
  237. if(c >= 'a' && c <= 'f') return (c - 'a') + 0xa;
  238. if(c >= 'A' && c <= 'F') return (c - 'A') + 0xa;
  239. if(c >= '0' && c <= '9') return (c - '0');
  240. return -1;
  241. }
  242. #endif
  243. static void
  244. dumptoken(DAPlexstate* lexstate)
  245. {
  246. fprintf(stderr,"TOKEN = |%s|\n",ocbytescontents(lexstate->yytext));
  247. }
  248. /*
  249. Simple lexer
  250. */
  251. void
  252. dapsetwordchars(DAPlexstate* lexstate, int kind)
  253. {
  254. switch (kind) {
  255. case 0:
  256. lexstate->worddelims = ddsworddelims;
  257. lexstate->wordchars1 = ddswordchars1;
  258. lexstate->wordcharsn = ddswordcharsn;
  259. break;
  260. case 1:
  261. lexstate->worddelims = ddsworddelims;
  262. lexstate->wordchars1 = ddswordchars1;
  263. lexstate->wordcharsn = daswordcharsn;
  264. break;
  265. case 2:
  266. lexstate->worddelims = ddsworddelims;
  267. lexstate->wordchars1 = cewordchars1;
  268. lexstate->wordcharsn = cewordcharsn;
  269. break;
  270. default: break;
  271. }
  272. }
  273. void
  274. daplexinit(char* input, DAPlexstate** lexstatep)
  275. {
  276. DAPlexstate* lexstate = (DAPlexstate*)malloc(sizeof(DAPlexstate));
  277. if(lexstatep) *lexstatep = lexstate;
  278. if(lexstate == NULL) return;
  279. memset((void*)lexstate,0,sizeof(DAPlexstate));
  280. lexstate->input = strdup(input);
  281. lexstate->next = lexstate->input;
  282. lexstate->yytext = ocbytesnew();
  283. lexstate->reclaim = oclistnew();
  284. dapsetwordchars(lexstate,0); /* Assume DDS */
  285. }
  286. void
  287. daplexcleanup(DAPlexstate** lexstatep)
  288. {
  289. DAPlexstate* lexstate = *lexstatep;
  290. if(lexstate == NULL) return;
  291. if(lexstate->input != NULL) ocfree(lexstate->input);
  292. if(lexstate->reclaim != NULL) {
  293. while(oclistlength(lexstate->reclaim) > 0) {
  294. char* word = (char*)oclistpop(lexstate->reclaim);
  295. if(word) free(word);
  296. }
  297. oclistfree(lexstate->reclaim);
  298. }
  299. ocbytesfree(lexstate->yytext);
  300. free(lexstate);
  301. *lexstatep = NULL;
  302. }
  303. /* Dap identifiers will come to us with some
  304. characters escaped using the URL notation of
  305. %HH. The assumption here is that any character
  306. that is encoded is left encoded, except as follows:
  307. 1. if the encoded character is in fact a legal DAP2 character
  308. (alphanum+"_!~*'-\"") then it is decoded, otherwise not.
  309. */
  310. #ifndef DECODE_IDENTIFIERS
  311. static char* decodelist =
  312. "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_!~*'-\"";
  313. #endif
  314. char*
  315. dapdecode(DAPlexstate* lexstate, char* name)
  316. {
  317. char* decoded;
  318. #ifdef DECODE_IDENTIFIERS
  319. decoded = ocuridecode(name);
  320. #else
  321. decoded = ocuridecodeonly(name,decodelist);
  322. #endif
  323. oclistpush(lexstate->reclaim,(ocelem)decoded);
  324. return decoded;
  325. }