parser-scanner.c
1 //! @file parser-scanner.c
2 //! @author J. Marcel van der Veer
3
4 //! @section Copyright
5 //!
6 //! This file is part of Algol68G - an Algol 68 compiler-interpreter.
7 //! Copyright 2001-2025 J. Marcel van der Veer [algol68g@xs4all.nl].
8
9 //! @section License
10 //!
11 //! This program is free software; you can redistribute it and/or modify it
12 //! under the terms of the GNU General Public License as published by the
13 //! Free Software Foundation; either version 3 of the License, or
14 //! (at your option) any later version.
15 //!
16 //! This program is distributed in the hope that it will be useful, but
17 //! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 //! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
19 //! more details. You should have received a copy of the GNU General Public
20 //! License along with this program. If not, see [http://www.gnu.org/licenses/].
21
22 //! @section Synopsis
23 //!
24 //! Context-dependent Algol 68 tokeniser.
25
26 #include "a68g.h"
27 #include "a68g-parser.h"
28 #include "a68g-prelude.h"
29 #include "a68g-options.h"
30 #include "a68g-environ.h"
31
32 // Macros.
33
34 #define SCAN_DIGITS(c)\
35 while (IS_DIGIT (c)) {\
36 (sym++)[0] = (c);\
37 (c) = next_char (ref_l, ref_s, A68G_TRUE);\
38 }
39
40 #define SCAN_EXPONENT_PART(c)\
41 (sym++)[0] = EXPONENT_CHAR;\
42 (c) = next_char (ref_l, ref_s, A68G_TRUE);\
43 if ((c) == '+' || (c) == '-') {\
44 (sym++)[0] = (c);\
45 (c) = next_char (ref_l, ref_s, A68G_TRUE);\
46 }\
47 SCAN_ERROR (!IS_DIGIT (c), *start_l, *start_c, ERROR_EXPONENT_DIGIT);\
48 SCAN_DIGITS (c)
49
50 //! @brief Save scanner state, for character look-ahead.
51
52 void save_state (LINE_T * ref_l, char *ref_s, char ch)
53 {
54 SCAN_STATE_L (&A68G_JOB) = ref_l;
55 SCAN_STATE_S (&A68G_JOB) = ref_s;
56 SCAN_STATE_C (&A68G_JOB) = ch;
57 }
58
59 //! @brief Restore scanner state, for character look-ahead.
60
61 void restore_state (LINE_T ** ref_l, char **ref_s, char *ch)
62 {
63 *ref_l = SCAN_STATE_L (&A68G_JOB);
64 *ref_s = SCAN_STATE_S (&A68G_JOB);
65 *ch = SCAN_STATE_C (&A68G_JOB);
66 }
67
68 //! @brief New_source_line.
69
70 LINE_T *new_source_line (void)
71 {
72 LINE_T *z = (LINE_T *) get_fixed_heap_space ((size_t) SIZE_ALIGNED (LINE_T));
73 MARKER (z)[0] = NULL_CHAR;
74 STRING (z) = NO_TEXT;
75 FILENAME (z) = NO_TEXT;
76 DIAGNOSTICS (z) = NO_DIAGNOSTIC;
77 NUMBER (z) = 0;
78 PRINT_STATUS (z) = 0;
79 LIST (z) = A68G_TRUE;
80 NEXT (z) = NO_LINE;
81 PREVIOUS (z) = NO_LINE;
82 return z;
83 }
84
85 //! @brief Append a source line to the internal source file.
86
87 void append_source_line (char *str, LINE_T ** ref_l, int *line_num, char *filename)
88 {
89 LINE_T *z = new_source_line ();
90 // Allow shell command in first line, f.i. "#!/usr/share/bin/a68g".
91 if (*line_num == 1) {
92 if (strlen (str) >= 2 && strncmp (str, "#!", strlen ("#!")) == 0) {
93 ABEND (strstr (str, "run-script") != NO_TEXT, ERROR_SHELL_SCRIPT, __func__);
94 (*line_num)++;
95 return;
96 }
97 }
98 // Link line into the chain.
99 STRING (z) = new_fixed_string (str);
100 FILENAME (z) = filename;
101 NUMBER (z) = (*line_num)++;
102 PRINT_STATUS (z) = NOT_PRINTED;
103 LIST (z) = A68G_TRUE;
104 DIAGNOSTICS (z) = NO_DIAGNOSTIC;
105 NEXT (z) = NO_LINE;
106 PREVIOUS (z) = *ref_l;
107 if (TOP_LINE (&A68G_JOB) == NO_LINE) {
108 TOP_LINE (&A68G_JOB) = z;
109 }
110 if (*ref_l != NO_LINE) {
111 NEXT (*ref_l) = z;
112 }
113 *ref_l = z;
114 }
115
116 // Scanner, tokenises the source code.
117
118 //! @brief Whether ch is unworthy.
119
120 void unworthy (LINE_T * u, char *v, char ch)
121 {
122 if (IS_PRINT (ch)) {
123 ASSERT (a68g_bufprt (A68G (edit_line), SNPRINTF_SIZE, "*%s", ERROR_UNWORTHY_CHARACTER) >= 0);
124 } else {
125 ASSERT (a68g_bufprt (A68G (edit_line), SNPRINTF_SIZE, "*%s %s", ERROR_UNWORTHY_CHARACTER, ctrl_char (ch)) >= 0);
126 }
127 scan_error (u, v, A68G (edit_line));
128 }
129
130 //! @brief Concatenate lines that terminate in '\' with next line.
131
132 void concatenate_lines (LINE_T * top)
133 {
134 LINE_T *q;
135 // Work from bottom backwards.
136 for (q = top; q != NO_LINE && NEXT (q) != NO_LINE; q = NEXT (q)) {
137 ;
138 }
139 for (; q != NO_LINE; BACKWARD (q)) {
140 char *z = STRING (q);
141 size_t len = strlen (z);
142 if (len >= 2 && z[len - 2] == BACKSLASH_CHAR && z[len - 1] == NEWLINE_CHAR && NEXT (q) != NO_LINE && STRING (NEXT (q)) != NO_TEXT) {
143 z[len - 2] = NULL_CHAR;
144 len += strlen (STRING (NEXT (q)));
145 z = (char *) get_fixed_heap_space ((size_t) (len + 1));
146 a68g_bufcpy (z, STRING (q), len + 1);
147 a68g_bufcat (z, STRING (NEXT (q)), len + 1);
148 STRING (NEXT (q))[0] = NULL_CHAR;
149 STRING (q) = z;
150 }
151 }
152 }
153
154 //! @brief Whether u is bold tag v, independent of stropping regime.
155
156 BOOL_T is_bold (char *u, char *v)
157 {
158 size_t len = strlen (v);
159 if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
160 if (u[0] == '\'') {
161 return (BOOL_T) (strncmp (++u, v, len) == 0 && u[len] == '\'');
162 } else {
163 return A68G_FALSE;
164 }
165 } else {
166 return (BOOL_T) (strncmp (u, v, len) == 0 && !IS_UPPER (u[len]));
167 }
168 }
169
170 //! @brief Skip string.
171
172 BOOL_T skip_string (LINE_T ** top, char **ch)
173 {
174 LINE_T *u = *top;
175 char *v = *ch;
176 v++;
177 while (u != NO_LINE) {
178 while (v[0] != NULL_CHAR) {
179 if (v[0] == QUOTE_CHAR && v[1] != QUOTE_CHAR) {
180 *top = u;
181 *ch = &v[1];
182 return A68G_TRUE;
183 } else if (v[0] == QUOTE_CHAR && v[1] == QUOTE_CHAR) {
184 v += 2;
185 } else {
186 v++;
187 }
188 }
189 FORWARD (u);
190 if (u != NO_LINE) {
191 v = &(STRING (u)[0]);
192 } else {
193 v = NO_TEXT;
194 }
195 }
196 return A68G_FALSE;
197 }
198
199 //! @brief Skip comment.
200
201 BOOL_T skip_comment (LINE_T ** top, char **ch, int delim)
202 {
203 LINE_T *u = *top;
204 char *v = *ch;
205 BOOL_T qstrop = OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING;
206 v++;
207 while (u != NO_LINE) {
208 while (v[0] != NULL_CHAR) {
209 if (is_bold (v, "COMMENT") && delim == BOLD_COMMENT_SYMBOL) {
210 char *w = &v[strlen(qstrop ? "'COMMENT'" : "COMMENT")];
211 if (!IS_UPPER(w[0])) {
212 *top = u;
213 *ch = w;
214 return A68G_TRUE;
215 }
216 } else if (is_bold (v, "CO") && delim == STYLE_I_COMMENT_SYMBOL) {
217 char *w = &v[strlen(qstrop ? "'CO'" : "CO")];
218 if (!IS_UPPER(w[0])) {
219 *top = u;
220 *ch = w;
221 return A68G_TRUE;
222 }
223 } else if (v[0] == '#' && delim == STYLE_II_COMMENT_SYMBOL) {
224 *top = u;
225 *ch = &v[1];
226 return A68G_TRUE;
227 } else {
228 v++;
229 }
230 }
231 FORWARD (u);
232 if (u != NO_LINE) {
233 v = &(STRING (u)[0]);
234 } else {
235 v = NO_TEXT;
236 }
237 }
238 return A68G_FALSE;
239 }
240
241 //! @brief Skip rest of pragmat.
242
243 BOOL_T skip_pragmat (LINE_T ** top, char **ch, int delim, BOOL_T whitespace)
244 {
245 LINE_T *u = *top;
246 char *v = *ch;
247 BOOL_T qstrop = OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING;
248 while (u != NO_LINE) {
249 while (v[0] != NULL_CHAR) {
250 if (is_bold (v, "PRAGMAT") && delim == BOLD_PRAGMAT_SYMBOL) {
251 char *w = &v[strlen(qstrop ? "'PRAGMAT'" : "PRAGMAT")];
252 if (!IS_UPPER(w[0])) {
253 *top = u;
254 *ch = w;
255 return A68G_TRUE;
256 }
257 } else if (is_bold (v, "PR") && delim == STYLE_I_PRAGMAT_SYMBOL) {
258 char *w = &v[strlen(qstrop ? "'PR'" : "PR")];
259 if (!IS_UPPER(w[0])) {
260 *top = u;
261 *ch = w;
262 return A68G_TRUE;
263 }
264 } else {
265 if (whitespace && !IS_SPACE (v[0]) && v[0] != NEWLINE_CHAR) {
266 scan_error (u, v, ERROR_PRAGMENT);
267 } else if (IS_UPPER (v[0])) {
268 // Skip a bold word as you may trigger on REPR, for instance ...
269 while (IS_UPPER (v[0])) {
270 v++;
271 }
272 } else {
273 v++;
274 }
275 }
276 }
277 FORWARD (u);
278 if (u != NO_LINE) {
279 v = &(STRING (u)[0]);
280 } else {
281 v = NO_TEXT;
282 }
283 }
284 return A68G_FALSE;
285 }
286
287 //! @brief Return pointer to next token within pragmat.
288
289 char *get_pragmat_item (LINE_T ** top, char **ch)
290 {
291 LINE_T *u = *top;
292 char *v = *ch;
293 while (u != NO_LINE) {
294 while (v[0] != NULL_CHAR) {
295 if (!IS_SPACE (v[0]) && v[0] != NEWLINE_CHAR) {
296 *top = u;
297 *ch = v;
298 return v;
299 } else {
300 v++;
301 }
302 }
303 FORWARD (u);
304 if (u != NO_LINE) {
305 v = &(STRING (u)[0]);
306 } else {
307 v = NO_TEXT;
308 }
309 }
310 return NO_TEXT;
311 }
312
313 //! @brief Case insensitive strncmp for at most the number of chars in 'v'.
314
315 int streq (char *u, char *v)
316 {
317 int diff;
318 for (diff = 0; diff == 0 && u[0] != NULL_CHAR && v[0] != NULL_CHAR; u++, v++) {
319 diff = ((int) TO_LOWER (u[0])) - ((int) TO_LOWER (v[0]));
320 }
321 return diff;
322 }
323
324 //! @brief Scan for next pragmat and yield first pragmat item.
325
326 char *next_preprocessor_item (LINE_T ** top, char **ch, int *delim)
327 {
328 LINE_T *u = *top;
329 char *v = *ch;
330 *delim = 0;
331 while (u != NO_LINE) {
332 while (v[0] != NULL_CHAR) {
333 LINE_T *start_l = u;
334 char *start_c = v;
335 // STRINGs must be skipped.
336 if (v[0] == QUOTE_CHAR) {
337 SCAN_ERROR (!skip_string (&u, &v), start_l, start_c, ERROR_UNTERMINATED_STRING);
338 }
339 // COMMENTS must be skipped.
340 else if (is_bold (v, "COMMENT")) {
341 SCAN_ERROR (!skip_comment (&u, &v, BOLD_COMMENT_SYMBOL), start_l, start_c, ERROR_UNTERMINATED_COMMENT);
342 } else if (is_bold (v, "CO")) {
343 SCAN_ERROR (!skip_comment (&u, &v, STYLE_I_COMMENT_SYMBOL), start_l, start_c, ERROR_UNTERMINATED_COMMENT);
344 } else if (v[0] == '#') {
345 SCAN_ERROR (!skip_comment (&u, &v, STYLE_II_COMMENT_SYMBOL), start_l, start_c, ERROR_UNTERMINATED_COMMENT);
346 } else if (is_bold (v, "PRAGMAT") || is_bold (v, "PR")) {
347 // We caught a PRAGMAT.
348 char *item;
349 if (is_bold (v, "PRAGMAT")) {
350 *delim = BOLD_PRAGMAT_SYMBOL;
351 v = &v[strlen ("PRAGMAT")];
352 } else if (is_bold (v, "PR")) {
353 *delim = STYLE_I_PRAGMAT_SYMBOL;
354 v = &v[strlen ("PR")];
355 }
356 item = get_pragmat_item (&u, &v);
357 SCAN_ERROR (item == NO_TEXT, start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
358 // Item "preprocessor" restarts preprocessing if it is off.
359 if (A68G_PARSER (no_preprocessing) && streq (item, "PREPROCESSOR") == 0) {
360 A68G_PARSER (no_preprocessing) = A68G_FALSE;
361 SCAN_ERROR (!skip_pragmat (&u, &v, *delim, A68G_TRUE), start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
362 }
363 // If preprocessing is switched off, we idle to closing bracket.
364 else if (A68G_PARSER (no_preprocessing)) {
365 SCAN_ERROR (!skip_pragmat (&u, &v, *delim, A68G_FALSE), start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
366 }
367 // Item "nopreprocessor" stops preprocessing if it is on.
368 if (streq (item, "NOPREPROCESSOR") == 0) {
369 A68G_PARSER (no_preprocessing) = A68G_TRUE;
370 SCAN_ERROR (!skip_pragmat (&u, &v, *delim, A68G_TRUE), start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
371 }
372 // Item "INCLUDE" includes a file.
373 else if (streq (item, "INCLUDE") == 0) {
374 *top = u;
375 *ch = v;
376 return item;
377 }
378 // Item "READ" includes a file.
379 else if (streq (item, "READ") == 0) {
380 *top = u;
381 *ch = v;
382 return item;
383 }
384 // Unrecognised item - probably options handled later by the tokeniser.
385 else {
386 SCAN_ERROR (!skip_pragmat (&u, &v, *delim, A68G_FALSE), start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
387 }
388 } else if (IS_UPPER (v[0])) {
389 // Skip a bold word as you may trigger on REPR, for instance ...
390 while (IS_UPPER (v[0])) {
391 v++;
392 }
393 } else {
394 v++;
395 }
396 }
397 FORWARD (u);
398 if (u != NO_LINE) {
399 v = &(STRING (u)[0]);
400 } else {
401 v = NO_TEXT;
402 }
403 }
404 *top = u;
405 *ch = v;
406 return NO_TEXT;
407 }
408
409 //! @brief Include files.
410
411 void include_files (LINE_T * top)
412 {
413 // include_files
414 //
415 // syntax: PR read "filename" PR
416 // PR include "filename" PR
417 //
418 // The file gets inserted before the line containing the pragmat. In this way
419 // correct line numbers are preserved which helps diagnostics. A file that has
420 // been included will not be included a second time - it will be ignored.
421 // A rigorous fail-safe, but there is no mechanism to prevent recursive includes
422 // in A68 source code. User reports do not indicate sophisticated use of INCLUDE,
423 // so this is fine for now.
424 // TODO - some day we might need `app', analogous to `cpp'.
425 BOOL_T make_pass = A68G_TRUE;
426 while (make_pass) {
427 LINE_T *s, *t, *u = top;
428 char *v = &(STRING (u)[0]);
429 make_pass = A68G_FALSE;
430 errno = 0;
431 while (u != NO_LINE) {
432 int pr_lim;
433 char *item = next_preprocessor_item (&u, &v, &pr_lim);
434 LINE_T *start_l = u;
435 char *start_c = v;
436 // Search for PR include "filename" PR.
437 if (item != NO_TEXT && (streq (item, "INCLUDE") == 0 || streq (item, "READ") == 0)) {
438 FILE_T fd;
439 int n, line_num, fsize, k, bytes_read;
440 char *fbuf, delim;
441 BUFFER fnb;
442 char *fn;
443 // Skip to filename.
444 while (IS_ALPHA (v[0])) {
445 v++;
446 }
447 while (IS_SPACE (v[0])) {
448 v++;
449 }
450 // Scan quoted filename.
451 SCAN_ERROR ((v[0] != QUOTE_CHAR && v[0] != '\''), start_l, start_c, ERROR_INCORRECT_FILENAME);
452 delim = (v++)[0];
453 n = 0;
454 fnb[0] = NULL_CHAR;
455 // Scan Algol 68 string (note: "" denotes a ", while in C it concatenates).
456 do {
457 SCAN_ERROR (EOL (v[0]), start_l, start_c, ERROR_INCORRECT_FILENAME);
458 SCAN_ERROR (n == BUFFER_SIZE - 1, start_l, start_c, ERROR_INCORRECT_FILENAME);
459 if (v[0] == delim) {
460 while (v[0] == delim && v[1] == delim) {
461 SCAN_ERROR (n == BUFFER_SIZE - 1, start_l, start_c, ERROR_INCORRECT_FILENAME);
462 fnb[n++] = delim;
463 fnb[n] = NULL_CHAR;
464 v += 2;
465 }
466 } else if (IS_PRINT (v[0])) {
467 fnb[n++] = *(v++);
468 fnb[n] = NULL_CHAR;
469 } else {
470 SCAN_ERROR (A68G_TRUE, start_l, start_c, ERROR_INCORRECT_FILENAME);
471 }
472 } while (v[0] != delim);
473 // Insist that the pragmat is closed properly.
474 v = &v[1];
475 SCAN_ERROR (!skip_pragmat (&u, &v, pr_lim, A68G_TRUE), start_l, start_c, ERROR_UNTERMINATED_PRAGMAT);
476 SCAN_ERROR (n == 0, start_l, start_c, ERROR_INCORRECT_FILENAME);
477 // Make the name relative to the position of the source file (C preprocessor standard).
478 if (FILENAME (u) != NO_TEXT) {
479 fn = a68g_relpath (a68g_dirname (FILENAME (u)), a68g_dirname (fnb), a68g_basename (fnb));
480 } else {
481 fn = a68g_relpath (FILE_PATH (&A68G_JOB), a68g_dirname (fnb), a68g_basename (fnb));
482 }
483 // Do not check errno, since errno may be undefined here after a successful call.
484 if (fn != NO_TEXT) {
485 a68g_bufcpy (fnb, fn, BUFFER_SIZE);
486 } else {
487 SCAN_ERROR_INFO (A68G_TRUE, NO_LINE, NO_TEXT, ERROR_SOURCE_FILE_INCLUDE_OPEN, fnb);
488 }
489 size_t fnwid = strlen (fnb) + 1;
490 fn = (char *) get_fixed_heap_space ((size_t) fnwid);
491 a68g_bufcpy (fn, fnb, fnwid);
492 // Ignore the file when included more than once.
493 for (t = top; t != NO_LINE; t = NEXT (t)) {
494 if (strcmp (FILENAME (t), fn) == 0) {
495 goto search_next_pragmat;
496 }
497 }
498 // Access the file.
499 errno = 0;
500 fd = open (fn, O_RDONLY | O_BINARY);
501 SCAN_ERROR_INFO (fd == -1, start_l, start_c, ERROR_SOURCE_FILE_INCLUDE_OPEN, fnb);
502 errno = 0;
503 fsize = (int) lseek (fd, 0, SEEK_END);
504 ASSERT (fsize >= 0);
505 SCAN_ERROR (errno != 0, start_l, start_c, ERROR_FILE_READ);
506 fbuf = (char *) get_temp_heap_space ((unt) (8 + fsize));
507 errno = 0;
508 ASSERT (lseek (fd, 0, SEEK_SET) >= 0);
509 SCAN_ERROR (errno != 0, start_l, start_c, ERROR_FILE_READ);
510 errno = 0;
511 bytes_read = (int) io_read (fd, fbuf, (size_t) fsize);
512 SCAN_ERROR (errno != 0 || bytes_read != fsize, start_l, start_c, ERROR_FILE_READ);
513 // Buffer still usable?.
514 if (fsize > A68G_PARSER (max_scan_buf_length)) {
515 A68G_PARSER (max_scan_buf_length) = fsize;
516 A68G_PARSER (scan_buf) = (char *) get_temp_heap_space ((unt) (8 + A68G_PARSER (max_scan_buf_length)));
517 }
518 // Link all lines into the list.
519 line_num = 1;
520 s = u;
521 t = PREVIOUS (u);
522 k = 0;
523 if (fsize == 0) {
524 // If file is empty, insert single empty line.
525 A68G_PARSER (scan_buf)[0] = NEWLINE_CHAR;
526 A68G_PARSER (scan_buf)[1] = NULL_CHAR;
527 append_source_line (A68G_PARSER (scan_buf), &t, &line_num, fn);
528 } else
529 while (k < fsize) {
530 n = 0;
531 A68G_PARSER (scan_buf)[0] = NULL_CHAR;
532 while (k < fsize && fbuf[k] != NEWLINE_CHAR) {
533 SCAN_ERROR ((IS_CNTRL (fbuf[k]) && !IS_SPACE (fbuf[k])) || fbuf[k] == STOP_CHAR, start_l, start_c, ERROR_FILE_INCLUDE_CTRL);
534 A68G_PARSER (scan_buf)[n++] = fbuf[k++];
535 A68G_PARSER (scan_buf)[n] = NULL_CHAR;
536 }
537 A68G_PARSER (scan_buf)[n++] = NEWLINE_CHAR;
538 A68G_PARSER (scan_buf)[n] = NULL_CHAR;
539 if (k < fsize) {
540 k++;
541 }
542 append_source_line (A68G_PARSER (scan_buf), &t, &line_num, fn);
543 }
544 // Conclude and go find another include directive, if any.
545 NEXT (t) = s;
546 PREVIOUS (s) = t;
547 concatenate_lines (top);
548 ASSERT (close (fd) == 0);
549 make_pass = A68G_TRUE;
550 }
551 search_next_pragmat:_SKIP_;
552 }
553 }
554 }
555
556 //! @brief Size of source file.
557
558 int get_source_size (void)
559 {
560 FILE_T f = FILE_SOURCE_FD (&A68G_JOB);
561 // This is why WIN32 must open as "read binary".
562 return (int) lseek (f, 0, SEEK_END);
563 }
564
565 //! @brief Append environment source lines.
566
567 void append_environ (char *str[], LINE_T ** ref_l, int *line_num, char *name)
568 {
569 for (int k = 0; str[k] != NO_TEXT; k++) {
570 int zero_line_num = 0;
571 (void) line_num;
572 append_source_line (str[k], ref_l, &zero_line_num, name);
573 }
574 }
575
576 //! @brief Read script file and make internal copy.
577
578 BOOL_T read_script_file (void)
579 {
580 LINE_T *ref_l = NO_LINE;
581 int k, n, num;
582 unt len;
583 BOOL_T file_end = A68G_FALSE;
584 BUFFER filename, linenum;
585 char ch, *fn, *line;
586 char *buffer = (char *) get_temp_heap_space ((unt) (8 + A68G_PARSER (source_file_size)));
587 FILE_T source = FILE_SOURCE_FD (&A68G_JOB);
588 ABEND (source == -1, ERROR_ACTION, __func__);
589 buffer[0] = NULL_CHAR;
590 n = 0;
591 len = (unt) (8 + A68G_PARSER (source_file_size));
592 buffer = (char *) get_temp_heap_space (len);
593 ASSERT (lseek (source, 0, SEEK_SET) >= 0);
594 while (!file_end) {
595 // Read the original file name.
596 filename[0] = NULL_CHAR;
597 k = 0;
598 if (io_read (source, &ch, 1) == 0) {
599 file_end = A68G_TRUE;
600 continue;
601 }
602 while (ch != NEWLINE_CHAR) {
603 filename[k++] = ch;
604 ASSERT (io_read (source, &ch, 1) == 1);
605 }
606 filename[k] = NULL_CHAR;
607 fn = TEXT (add_token (&A68G (top_token), filename));
608 // Read the original file number.
609 linenum[0] = NULL_CHAR;
610 k = 0;
611 ASSERT (io_read (source, &ch, 1) == 1);
612 while (ch != NEWLINE_CHAR) {
613 linenum[k++] = ch;
614 ASSERT (io_read (source, &ch, 1) == 1);
615 }
616 linenum[k] = NULL_CHAR;
617 num = (int) strtol (linenum, NO_REF, 10);
618 ABEND (errno == ERANGE, ERROR_INTERNAL_CONSISTENCY, __func__);
619 // COPY original line into buffer.
620 ASSERT (io_read (source, &ch, 1) == 1);
621 line = &buffer[n];
622 while (ch != NEWLINE_CHAR) {
623 buffer[n++] = ch;
624 ASSERT (io_read (source, &ch, 1) == 1);
625 ABEND ((unt) n >= len, ERROR_ACTION, __func__);
626 }
627 buffer[n++] = NEWLINE_CHAR;
628 buffer[n] = NULL_CHAR;
629 append_source_line (line, &ref_l, &num, fn);
630 }
631 return A68G_TRUE;
632 }
633
634 //! @brief match first non-white characters in string.
635
636 BOOL_T a68g_start(char *u, char *v, char **end)
637 {
638 *end = NO_TEXT;
639 while (v[0] != NULL_CHAR) {
640 if (u[0] == NULL_CHAR) {
641 return A68G_FALSE;
642 } else if (IS_SPACE (u[0])) {
643 u++;
644 } else {
645 if (u[0] == v[0]) {
646 u++;
647 v++;
648 if (end != NULL) {
649 *end = u;
650 }
651 } else {
652 return A68G_FALSE;
653 }
654 }
655 }
656 return A68G_TRUE;
657 }
658
659 //! @brief Read source file and make internal copy.
660
661 BOOL_T read_source_file (void)
662 {
663 LINE_T *ref_l = NO_LINE;
664 int line_num = 0, k, bytes_read;
665 ssize_t l;
666 FILE_T f = FILE_SOURCE_FD (&A68G_JOB);
667 char **prelude_start, **postlude, *buffer, *text;
668 // Read the file into a single buffer, so we save on system calls.
669 line_num = 1;
670 errno = 0;
671 text = (char *) get_temp_heap_space ((unt) (8 + A68G_PARSER (source_file_size)));
672 ABEND (errno != 0 || text == NO_TEXT, ERROR_ALLOCATION, __func__);
673 ASSERT (lseek (f, 0, SEEK_SET) >= 0);
674 ABEND (errno != 0, ERROR_ACTION, __func__);
675 errno = 0;
676 bytes_read = (int) io_read (f, text, (size_t) A68G_PARSER (source_file_size));
677 ABEND (errno != 0 || bytes_read != A68G_PARSER (source_file_size), ERROR_ACTION, __func__);
678 // Little test on stropping.
679 char *pr1 = "'PR'QUOTESTROPPING'PR'";
680 char *pr2 = "'PRAGMAT'QUOTESTROPPING'PRAGMAT'";
681 char *end = NO_TEXT;
682 if (a68g_start (text, pr1, &end)) {
683 OPTION_STROPPING (&A68G_JOB) = QUOTE_STROPPING;
684 buffer = end;
685 A68G_PARSER (source_file_size) = strlen (buffer);
686 } else if (a68g_start (text, pr2, &end)) {
687 OPTION_STROPPING (&A68G_JOB) = QUOTE_STROPPING;
688 buffer = end;
689 A68G_PARSER (source_file_size) = strlen (buffer);
690 } else {
691 buffer = text;
692 }
693 // Prelude.
694 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
695 prelude_start = bold_prelude_start;
696 postlude = bold_postlude;
697 } else if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
698 prelude_start = quote_prelude_start;
699 postlude = quote_postlude;
700 } else {
701 prelude_start = postlude = NO_REF;
702 }
703 append_environ (prelude_start, &ref_l, &line_num, "prelude");
704 // Link all lines into the list.
705 k = 0;
706 while (k < A68G_PARSER (source_file_size)) {
707 l = 0;
708 A68G_PARSER (scan_buf)[0] = NULL_CHAR;
709 while (k < A68G_PARSER (source_file_size) && buffer[k] != NEWLINE_CHAR) {
710 if (k < A68G_PARSER (source_file_size) - 1 && buffer[k] == CR_CHAR && buffer[k + 1] == NEWLINE_CHAR) {
711 k++;
712 } else {
713 A68G_PARSER (scan_buf)[l++] = buffer[k++];
714 A68G_PARSER (scan_buf)[l] = NULL_CHAR;
715 }
716 }
717 A68G_PARSER (scan_buf)[l++] = NEWLINE_CHAR;
718 A68G_PARSER (scan_buf)[l] = NULL_CHAR;
719 if (k < A68G_PARSER (source_file_size)) {
720 k++;
721 }
722 append_source_line (A68G_PARSER (scan_buf), &ref_l, &line_num, FILE_SOURCE_NAME (&A68G_JOB));
723 SCAN_ERROR (l != strlen (A68G_PARSER (scan_buf)), NO_LINE, NO_TEXT, ERROR_FILE_SOURCE_CTRL);
724 }
725 // Postlude.
726 append_environ (postlude, &ref_l, &line_num, "postlude");
727 // Concatenate lines.
728 concatenate_lines (TOP_LINE (&A68G_JOB));
729 // Include files.
730 include_files (TOP_LINE (&A68G_JOB));
731 return A68G_TRUE;
732 }
733
734 //! @brief Next_char get next character from internal copy of source file.
735
736 char next_char (LINE_T ** ref_l, char **ref_s, BOOL_T allow_typo)
737 {
738 char ch;
739 #if defined (NO_TYPO)
740 allow_typo = A68G_FALSE;
741 #endif
742 LOW_STACK_ALERT (NO_NODE);
743 // Source empty?.
744 if (*ref_l == NO_LINE) {
745 return STOP_CHAR;
746 } else {
747 LIST (*ref_l) = (BOOL_T) (OPTION_NODEMASK (&A68G_JOB) & SOURCE_MASK ? A68G_TRUE : A68G_FALSE);
748 // Take new line?.
749 if ((*ref_s)[0] == NEWLINE_CHAR || (*ref_s)[0] == NULL_CHAR) {
750 *ref_l = NEXT (*ref_l);
751 if (*ref_l == NO_LINE) {
752 return STOP_CHAR;
753 }
754 *ref_s = STRING (*ref_l);
755 } else {
756 (*ref_s)++;
757 }
758 // Deliver next char.
759 ch = (*ref_s)[0];
760 if (allow_typo && (IS_SPACE (ch) || ch == FORMFEED_CHAR)) {
761 ch = next_char (ref_l, ref_s, allow_typo);
762 }
763 return ch;
764 }
765 }
766
767 //! @brief Find first character that can start a valid symbol.
768
769 void get_good_char (char *ref_c, LINE_T ** ref_l, char **ref_s)
770 {
771 while (*ref_c != STOP_CHAR && (IS_SPACE (*ref_c) || (*ref_c == NULL_CHAR))) {
772 if (*ref_l != NO_LINE) {
773 LIST (*ref_l) = (BOOL_T) (OPTION_NODEMASK (&A68G_JOB) & SOURCE_MASK ? A68G_TRUE : A68G_FALSE);
774 }
775 *ref_c = next_char (ref_l, ref_s, A68G_FALSE);
776 }
777 }
778
779 //! @brief Handle a pragment (pragmat or comment).
780
781 char *pragment (int type, LINE_T ** ref_l, char **ref_c)
782 {
783 size_t chars_in_buf;
784 #define INIT_BUFFER {chars_in_buf = 0; A68G_PARSER (scan_buf)[chars_in_buf] = NULL_CHAR;}
785 #define ADD_ONE_CHAR(ch) {A68G_PARSER (scan_buf)[chars_in_buf ++] = ch; A68G_PARSER (scan_buf)[chars_in_buf] = NULL_CHAR;}
786 //
787 char c = **ref_c, *term_s = NO_TEXT, *start_c = *ref_c;
788 char *z = NO_TEXT;
789 LINE_T *start_l = *ref_l;
790 BOOL_T stop, pragmat = A68G_FALSE;
791 // Set terminator.
792 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
793 if (type == STYLE_I_COMMENT_SYMBOL) {
794 term_s = "CO";
795 } else if (type == STYLE_II_COMMENT_SYMBOL) {
796 term_s = "#";
797 } else if (type == BOLD_COMMENT_SYMBOL) {
798 term_s = "COMMENT";
799 } else if (type == STYLE_I_PRAGMAT_SYMBOL) {
800 term_s = "PR";
801 pragmat = A68G_TRUE;
802 } else if (type == BOLD_PRAGMAT_SYMBOL) {
803 term_s = "PRAGMAT";
804 pragmat = A68G_TRUE;
805 }
806 } else if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
807 if (type == STYLE_I_COMMENT_SYMBOL) {
808 term_s = "'CO'";
809 } else if (type == STYLE_II_COMMENT_SYMBOL) {
810 term_s = "#";
811 } else if (type == BOLD_COMMENT_SYMBOL) {
812 term_s = "'COMMENT'";
813 } else if (type == STYLE_I_PRAGMAT_SYMBOL) {
814 term_s = "'PR'";
815 pragmat = A68G_TRUE;
816 } else if (type == BOLD_PRAGMAT_SYMBOL) {
817 term_s = "'PRAGMAT'";
818 pragmat = A68G_TRUE;
819 }
820 }
821 size_t term_s_length = strlen (term_s);
822 // Scan for terminator.
823 INIT_BUFFER;
824 stop = A68G_FALSE;
825 while (stop == A68G_FALSE) {
826 BOOL_T scan_next = A68G_TRUE;
827 SCAN_ERROR (c == STOP_CHAR, start_l, start_c, ERROR_UNTERMINATED_PRAGMENT);
828 // A ".." or '..' delimited string in a PRAGMAT.
829 if (pragmat && (c == QUOTE_CHAR || (c == '\'' && OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING))) {
830 char delim = c;
831 BOOL_T eos = A68G_FALSE;
832 ADD_ONE_CHAR (c);
833 c = next_char (ref_l, ref_c, A68G_FALSE);
834 while (!eos) {
835 SCAN_ERROR (EOL (c), start_l, start_c, ERROR_LONG_STRING);
836 if (c == delim) {
837 ADD_ONE_CHAR (delim);
838 save_state (*ref_l, *ref_c, c);
839 c = next_char (ref_l, ref_c, A68G_FALSE);
840 if (c == delim) {
841 c = next_char (ref_l, ref_c, A68G_FALSE);
842 } else {
843 restore_state (ref_l, ref_c, &c);
844 eos = A68G_TRUE;
845 }
846 } else if (IS_PRINT (c)) {
847 ADD_ONE_CHAR (c);
848 c = next_char (ref_l, ref_c, A68G_FALSE);
849 } else {
850 unworthy (start_l, start_c, c);
851 }
852 }
853 } else if (EOL (c)) {
854 ADD_ONE_CHAR (NEWLINE_CHAR);
855 } else if (IS_UPPER (c)) {
856 while (IS_UPPER (c)) {
857 ADD_ONE_CHAR (c);
858 c = next_char (ref_l, ref_c, A68G_FALSE);
859 }
860 scan_next = A68G_FALSE;
861 } else if (IS_PRINT (c) || IS_SPACE (c)) {
862 ADD_ONE_CHAR (c);
863 }
864 if (chars_in_buf >= term_s_length) {
865 // Check whether we encountered the terminator.
866 char *tok = &(A68G_PARSER (scan_buf)[chars_in_buf - term_s_length]);
867 stop = (BOOL_T) (strcmp (term_s, tok) == 0);
868 }
869 if (scan_next) {
870 c = next_char (ref_l, ref_c, A68G_FALSE);
871 }
872 }
873 A68G_PARSER (scan_buf)[chars_in_buf - term_s_length] = NULL_CHAR;
874 z = new_string (term_s, A68G_PARSER (scan_buf), term_s, NO_TEXT);
875 if (type == STYLE_I_PRAGMAT_SYMBOL || type == BOLD_PRAGMAT_SYMBOL) {
876 isolate_options (A68G_PARSER (scan_buf), start_l);
877 }
878 return z;
879 #undef ADD_ONE_CHAR
880 #undef INIT_BUFFER
881 }
882
883 //! @brief Attribute for format item.
884
885 int get_format_item (char ch)
886 {
887 switch (TO_LOWER (ch)) {
888 case 'a': {
889 return FORMAT_ITEM_A;
890 }
891 case 'b': {
892 return FORMAT_ITEM_B;
893 }
894 case 'c': {
895 return FORMAT_ITEM_C;
896 }
897 case 'd': {
898 return FORMAT_ITEM_D;
899 }
900 case 'e': {
901 return FORMAT_ITEM_E;
902 }
903 case 'f': {
904 return FORMAT_ITEM_F;
905 }
906 case 'g': {
907 return FORMAT_ITEM_G;
908 }
909 case 'h': {
910 return FORMAT_ITEM_H;
911 }
912 case 'i': {
913 return FORMAT_ITEM_I;
914 }
915 case 'j': {
916 return FORMAT_ITEM_J;
917 }
918 case 'k': {
919 return FORMAT_ITEM_K;
920 }
921 case 'l':
922 case '/': {
923 return FORMAT_ITEM_L;
924 }
925 case 'm': {
926 return FORMAT_ITEM_M;
927 }
928 case 'n': {
929 return FORMAT_ITEM_N;
930 }
931 case 'o': {
932 return FORMAT_ITEM_O;
933 }
934 case 'p': {
935 return FORMAT_ITEM_P;
936 }
937 case 'q': {
938 return FORMAT_ITEM_Q;
939 }
940 case 'r': {
941 return FORMAT_ITEM_R;
942 }
943 case 's': {
944 return FORMAT_ITEM_S;
945 }
946 case 't': {
947 return FORMAT_ITEM_T;
948 }
949 case 'u': {
950 return FORMAT_ITEM_U;
951 }
952 case 'v': {
953 return FORMAT_ITEM_V;
954 }
955 case 'w': {
956 return FORMAT_ITEM_W;
957 }
958 case 'x': {
959 return FORMAT_ITEM_X;
960 }
961 case 'y': {
962 return FORMAT_ITEM_Y;
963 }
964 case 'z': {
965 return FORMAT_ITEM_Z;
966 }
967 case '+': {
968 return FORMAT_ITEM_PLUS;
969 }
970 case '-': {
971 return FORMAT_ITEM_MINUS;
972 }
973 case POINT_CHAR: {
974 return FORMAT_ITEM_POINT;
975 }
976 case '%': {
977 return FORMAT_ITEM_ESCAPE;
978 }
979 default: {
980 return 0;
981 }
982 }
983 }
984
985 //! @brief Whether input shows exponent character.
986
987 BOOL_T is_exp_char (LINE_T ** ref_l, char **ref_s, char *ch)
988 {
989 BOOL_T ret = A68G_FALSE;
990 char exp_syms[3];
991 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
992 exp_syms[0] = EXPONENT_CHAR;
993 exp_syms[1] = (char) TO_UPPER (EXPONENT_CHAR);
994 exp_syms[2] = NULL_CHAR;
995 } else {
996 exp_syms[0] = (char) TO_UPPER (EXPONENT_CHAR);
997 exp_syms[1] = BACKSLASH_CHAR;
998 exp_syms[2] = NULL_CHAR;
999 }
1000 save_state (*ref_l, *ref_s, *ch);
1001 if (strchr (exp_syms, *ch) != NO_TEXT) {
1002 *ch = next_char (ref_l, ref_s, A68G_TRUE);
1003 ret = (BOOL_T) (strchr ("+-0123456789", *ch) != NO_TEXT);
1004 }
1005 restore_state (ref_l, ref_s, ch);
1006 return ret;
1007 }
1008
1009 //! @brief Whether input shows radix character.
1010
1011 BOOL_T is_radix_char (LINE_T ** ref_l, char **ref_s, char *ch)
1012 {
1013 BOOL_T ret = A68G_FALSE;
1014 save_state (*ref_l, *ref_s, *ch);
1015 if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
1016 if (*ch == TO_UPPER (RADIX_CHAR)) {
1017 *ch = next_char (ref_l, ref_s, A68G_TRUE);
1018 ret = (BOOL_T) (strchr ("0123456789ABCDEF", *ch) != NO_TEXT);
1019 }
1020 } else {
1021 if (*ch == RADIX_CHAR) {
1022 *ch = next_char (ref_l, ref_s, A68G_TRUE);
1023 ret = (BOOL_T) (strchr ("0123456789abcdef", *ch) != NO_TEXT);
1024 }
1025 }
1026 restore_state (ref_l, ref_s, ch);
1027 return ret;
1028 }
1029
1030 //! @brief Whether input shows decimal point.
1031
1032 BOOL_T is_decimal_point (LINE_T ** ref_l, char **ref_s, char *ch)
1033 {
1034 BOOL_T ret = A68G_FALSE;
1035 save_state (*ref_l, *ref_s, *ch);
1036 if (*ch == POINT_CHAR) {
1037 char exp_syms[3];
1038 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
1039 exp_syms[0] = EXPONENT_CHAR;
1040 exp_syms[1] = (char) TO_UPPER (EXPONENT_CHAR);
1041 exp_syms[2] = NULL_CHAR;
1042 } else {
1043 exp_syms[0] = (char) TO_UPPER (EXPONENT_CHAR);
1044 exp_syms[1] = BACKSLASH_CHAR;
1045 exp_syms[2] = NULL_CHAR;
1046 }
1047 *ch = next_char (ref_l, ref_s, A68G_TRUE);
1048 if (strchr (exp_syms, *ch) != NO_TEXT) {
1049 *ch = next_char (ref_l, ref_s, A68G_TRUE);
1050 ret = (BOOL_T) (strchr ("+-0123456789", *ch) != NO_TEXT);
1051 } else {
1052 ret = (BOOL_T) (strchr ("0123456789", *ch) != NO_TEXT);
1053 }
1054 }
1055 restore_state (ref_l, ref_s, ch);
1056 return ret;
1057 }
1058
1059 //! @brief Get next token from internal copy of source file..
1060
1061 void get_next_token (BOOL_T in_format, LINE_T ** ref_l, char **ref_s, LINE_T ** start_l, char **start_c, int *att)
1062 {
1063 char c = **ref_s, *sym = A68G_PARSER (scan_buf);
1064 sym[0] = NULL_CHAR;
1065 get_good_char (&c, ref_l, ref_s);
1066 *start_l = *ref_l;
1067 *start_c = *ref_s;
1068 if (c == STOP_CHAR) {
1069 // We are at EOF.
1070 (sym++)[0] = STOP_CHAR;
1071 sym[0] = NULL_CHAR;
1072 return;
1073 }
1074 // In a format.
1075 if (in_format) {
1076 char *format_items;
1077 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
1078 format_items = "/%\\+-.abcdefghijklmnopqrstuvwxyz";
1079 } else if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
1080 format_items = "/%\\+-.ABCDEFGHIJKLMNOPQRSTUVWXYZ";
1081 } else {
1082 format_items = "/%\\+-.abcdefghijklmnopqrstuvwxyz";
1083 }
1084 if (strchr (format_items, c) != NO_TEXT) {
1085 // General format items.
1086 (sym++)[0] = c;
1087 sym[0] = NULL_CHAR;
1088 *att = get_format_item (c);
1089 (void) next_char (ref_l, ref_s, A68G_FALSE);
1090 return;
1091 }
1092 if (IS_DIGIT (c)) {
1093 // INT denotation for static replicator.
1094 SCAN_DIGITS (c);
1095 sym[0] = NULL_CHAR;
1096 *att = STATIC_REPLICATOR;
1097 return;
1098 }
1099 }
1100 // Not in a format.
1101 if (IS_UPPER (c)) {
1102 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
1103 // Upper case word - bold tag.
1104 while (IS_UPPER (c) || c == '_') {
1105 (sym++)[0] = c;
1106 c = next_char (ref_l, ref_s, A68G_FALSE);
1107 }
1108 sym[0] = NULL_CHAR;
1109 *att = BOLD_TAG;
1110 } else if (OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
1111 while (IS_UPPER (c) || IS_DIGIT (c) || c == '_') {
1112 (sym++)[0] = c;
1113 c = next_char (ref_l, ref_s, A68G_TRUE);
1114 }
1115 sym[0] = NULL_CHAR;
1116 *att = IDENTIFIER;
1117 }
1118 } else if (c == '\'') {
1119 // Quote, uppercase word, quote - bold tag.
1120 int k = 0;
1121 c = next_char (ref_l, ref_s, A68G_FALSE);
1122 while (IS_UPPER (c) || IS_DIGIT (c) || c == '_') {
1123 (sym++)[0] = c;
1124 k++;
1125 c = next_char (ref_l, ref_s, A68G_TRUE);
1126 }
1127 SCAN_ERROR (k == 0, *start_l, *start_c, ERROR_QUOTED_BOLD_TAG);
1128 sym[0] = NULL_CHAR;
1129 *att = BOLD_TAG;
1130 // Skip terminating quote, or complain if it is not there.
1131 SCAN_ERROR (c != '\'', *start_l, *start_c, ERROR_QUOTED_BOLD_TAG);
1132 c = next_char (ref_l, ref_s, A68G_FALSE);
1133 } else if (IS_LOWER (c)) {
1134 // Lower case word - identifier.
1135 while (IS_LOWER (c) || IS_DIGIT (c) || c == '_') {
1136 (sym++)[0] = c;
1137 c = next_char (ref_l, ref_s, A68G_TRUE);
1138 }
1139 sym[0] = NULL_CHAR;
1140 *att = IDENTIFIER;
1141 } else if (c == POINT_CHAR) {
1142 // Begins with a point symbol - point, dotdot, L REAL denotation.
1143 if (is_decimal_point (ref_l, ref_s, &c)) {
1144 (sym++)[0] = '0';
1145 (sym++)[0] = POINT_CHAR;
1146 c = next_char (ref_l, ref_s, A68G_TRUE);
1147 SCAN_DIGITS (c);
1148 if (is_exp_char (ref_l, ref_s, &c)) {
1149 SCAN_EXPONENT_PART (c);
1150 }
1151 sym[0] = NULL_CHAR;
1152 *att = REAL_DENOTATION;
1153 } else {
1154 c = next_char (ref_l, ref_s, A68G_TRUE);
1155 if (c == POINT_CHAR) {
1156 (sym++)[0] = POINT_CHAR;
1157 (sym++)[0] = POINT_CHAR;
1158 sym[0] = NULL_CHAR;
1159 *att = DOTDOT_SYMBOL;
1160 c = next_char (ref_l, ref_s, A68G_FALSE);
1161 } else {
1162 (sym++)[0] = POINT_CHAR;
1163 sym[0] = NULL_CHAR;
1164 *att = POINT_SYMBOL;
1165 }
1166 }
1167 } else if (IS_DIGIT (c)) {
1168 // Something that begins with a digit - L INT denotation, L REAL denotation.
1169 SCAN_DIGITS (c);
1170 if (is_decimal_point (ref_l, ref_s, &c)) {
1171 c = next_char (ref_l, ref_s, A68G_TRUE);
1172 if (is_exp_char (ref_l, ref_s, &c)) {
1173 (sym++)[0] = POINT_CHAR;
1174 (sym++)[0] = '0';
1175 SCAN_EXPONENT_PART (c);
1176 *att = REAL_DENOTATION;
1177 } else {
1178 (sym++)[0] = POINT_CHAR;
1179 SCAN_DIGITS (c);
1180 if (is_exp_char (ref_l, ref_s, &c)) {
1181 SCAN_EXPONENT_PART (c);
1182 }
1183 *att = REAL_DENOTATION;
1184 }
1185 } else if (is_exp_char (ref_l, ref_s, &c)) {
1186 SCAN_EXPONENT_PART (c);
1187 *att = REAL_DENOTATION;
1188 } else if (is_radix_char (ref_l, ref_s, &c)) {
1189 (sym++)[0] = c;
1190 c = next_char (ref_l, ref_s, A68G_TRUE);
1191 if (OPTION_STROPPING (&A68G_JOB) == UPPER_STROPPING) {
1192 while (IS_DIGIT (c) || strchr ("abcdef", c) != NO_TEXT) {
1193 (sym++)[0] = c;
1194 c = next_char (ref_l, ref_s, A68G_TRUE);
1195 }
1196 } else {
1197 while (IS_DIGIT (c) || strchr ("ABCDEF", c) != NO_TEXT) {
1198 (sym++)[0] = c;
1199 c = next_char (ref_l, ref_s, A68G_TRUE);
1200 }
1201 }
1202 *att = BITS_DENOTATION;
1203 } else {
1204 *att = INT_DENOTATION;
1205 }
1206 sym[0] = NULL_CHAR;
1207 } else if (c == QUOTE_CHAR) {
1208 // STRING denotation.
1209 BOOL_T stop = A68G_FALSE;
1210 while (!stop) {
1211 c = next_char (ref_l, ref_s, A68G_FALSE);
1212 while (c != QUOTE_CHAR && c != STOP_CHAR) {
1213 SCAN_ERROR (EOL (c), *start_l, *start_c, ERROR_LONG_STRING);
1214 (sym++)[0] = c;
1215 c = next_char (ref_l, ref_s, A68G_FALSE);
1216 }
1217 SCAN_ERROR (*ref_l == NO_LINE, *start_l, *start_c, ERROR_UNTERMINATED_STRING);
1218 c = next_char (ref_l, ref_s, A68G_FALSE);
1219 if (c == QUOTE_CHAR) {
1220 (sym++)[0] = QUOTE_CHAR;
1221 } else {
1222 stop = A68G_TRUE;
1223 }
1224 }
1225 sym[0] = NULL_CHAR;
1226 *att = (in_format ? LITERAL : ROW_CHAR_DENOTATION);
1227 } else if (strchr ("#$()[]{},;@", c) != NO_TEXT) {
1228 // Single character symbols.
1229 (sym++)[0] = c;
1230 (void) next_char (ref_l, ref_s, A68G_FALSE);
1231 sym[0] = NULL_CHAR;
1232 *att = 0;
1233 } else if (c == '|') {
1234 // Bar.
1235 (sym++)[0] = c;
1236 c = next_char (ref_l, ref_s, A68G_FALSE);
1237 if (c == ':') {
1238 (sym++)[0] = c;
1239 (void) next_char (ref_l, ref_s, A68G_FALSE);
1240 }
1241 sym[0] = NULL_CHAR;
1242 *att = 0;
1243 } else if (c == '!' && OPTION_STROPPING (&A68G_JOB) == QUOTE_STROPPING) {
1244 // Bar, will be replaced with modern variant.
1245 // For this reason ! is not a MONAD with quote-stropping.
1246 (sym++)[0] = '|';
1247 c = next_char (ref_l, ref_s, A68G_FALSE);
1248 if (c == ':') {
1249 (sym++)[0] = c;
1250 (void) next_char (ref_l, ref_s, A68G_FALSE);
1251 }
1252 sym[0] = NULL_CHAR;
1253 *att = 0;
1254 } else if (c == ':') {
1255 // Colon, semicolon, IS, ISNT.
1256 (sym++)[0] = c;
1257 c = next_char (ref_l, ref_s, A68G_FALSE);
1258 if (c == '=') {
1259 (sym++)[0] = c;
1260 if ((c = next_char (ref_l, ref_s, A68G_FALSE)) == ':') {
1261 (sym++)[0] = c;
1262 c = next_char (ref_l, ref_s, A68G_FALSE);
1263 }
1264 } else if (c == '/') {
1265 (sym++)[0] = c;
1266 if ((c = next_char (ref_l, ref_s, A68G_FALSE)) == '=') {
1267 (sym++)[0] = c;
1268 if ((c = next_char (ref_l, ref_s, A68G_FALSE)) == ':') {
1269 (sym++)[0] = c;
1270 c = next_char (ref_l, ref_s, A68G_FALSE);
1271 }
1272 }
1273 } else if (c == ':') {
1274 (sym++)[0] = c;
1275 if ((c = next_char (ref_l, ref_s, A68G_FALSE)) == '=') {
1276 (sym++)[0] = c;
1277 }
1278 }
1279 sym[0] = NULL_CHAR;
1280 *att = 0;
1281 } else if (c == '=') {
1282 // Operator starting with "=".
1283 char *scanned = sym;
1284 (sym++)[0] = c;
1285 c = next_char (ref_l, ref_s, A68G_FALSE);
1286 if (strchr (NOMADS, c) != NO_TEXT) {
1287 (sym++)[0] = c;
1288 c = next_char (ref_l, ref_s, A68G_FALSE);
1289 }
1290 if (c == '=') {
1291 (sym++)[0] = c;
1292 if (next_char (ref_l, ref_s, A68G_FALSE) == ':') {
1293 (sym++)[0] = ':';
1294 c = next_char (ref_l, ref_s, A68G_FALSE);
1295 if (strlen (sym) < 4 && c == '=') {
1296 (sym++)[0] = '=';
1297 (void) next_char (ref_l, ref_s, A68G_FALSE);
1298 }
1299 }
1300 } else if (c == ':') {
1301 (sym++)[0] = c;
1302 sym[0] = NULL_CHAR;
1303 if (next_char (ref_l, ref_s, A68G_FALSE) == '=') {
1304 (sym++)[0] = '=';
1305 (void) next_char (ref_l, ref_s, A68G_FALSE);
1306 } else {
1307 SCAN_ERROR (!(strcmp (scanned, "=:") == 0 || strcmp (scanned, "==:") == 0), *start_l, *start_c, ERROR_INVALID_OPERATOR_TAG);
1308 }
1309 }
1310 sym[0] = NULL_CHAR;
1311 if (strcmp (scanned, "=") == 0) {
1312 *att = EQUALS_SYMBOL;
1313 } else {
1314 *att = OPERATOR;
1315 }
1316 } else if (strchr (MONADS, c) != NO_TEXT || strchr (NOMADS, c) != NO_TEXT) {
1317 // Operator.
1318 char *scanned = sym;
1319 (sym++)[0] = c;
1320 c = next_char (ref_l, ref_s, A68G_FALSE);
1321 if (strchr (NOMADS, c) != NO_TEXT) {
1322 (sym++)[0] = c;
1323 c = next_char (ref_l, ref_s, A68G_FALSE);
1324 }
1325 if (c == '=') {
1326 (sym++)[0] = c;
1327 if (next_char (ref_l, ref_s, A68G_FALSE) == ':') {
1328 (sym++)[0] = ':';
1329 c = next_char (ref_l, ref_s, A68G_FALSE);
1330 if (strlen (scanned) < 4 && c == '=') {
1331 (sym++)[0] = '=';
1332 (void) next_char (ref_l, ref_s, A68G_FALSE);
1333 }
1334 }
1335 } else if (c == ':') {
1336 (sym++)[0] = c;
1337 sym[0] = NULL_CHAR;
1338 if (next_char (ref_l, ref_s, A68G_FALSE) == '=') {
1339 (sym++)[0] = '=';
1340 sym[0] = NULL_CHAR;
1341 (void) next_char (ref_l, ref_s, A68G_FALSE);
1342 } else {
1343 SCAN_ERROR (strcmp (&(scanned[1]), "=:") != 0, *start_l, *start_c, ERROR_INVALID_OPERATOR_TAG);
1344 }
1345 }
1346 sym[0] = NULL_CHAR;
1347 *att = OPERATOR;
1348 } else {
1349 // Afuuus ... strange characters!.
1350 unworthy (*start_l, *start_c, (int) c);
1351 }
1352 }
1353
1354 //! @brief Whether att opens an embedded clause.
1355
1356 BOOL_T open_nested_clause (int att)
1357 {
1358 switch (att) {
1359 case OPEN_SYMBOL:
1360 case BEGIN_SYMBOL:
1361 case PAR_SYMBOL:
1362 case IF_SYMBOL:
1363 case CASE_SYMBOL:
1364 case FOR_SYMBOL:
1365 case FROM_SYMBOL:
1366 case BY_SYMBOL:
1367 case TO_SYMBOL:
1368 case DOWNTO_SYMBOL:
1369 case WHILE_SYMBOL:
1370 case DO_SYMBOL:
1371 case SUB_SYMBOL:
1372 case ACCO_SYMBOL: {
1373 return A68G_TRUE;
1374 }
1375 }
1376 return A68G_FALSE;
1377 }
1378
1379 //! @brief Whether att closes an embedded clause.
1380
1381 BOOL_T close_nested_clause (int att)
1382 {
1383 switch (att) {
1384 case CLOSE_SYMBOL:
1385 case END_SYMBOL:
1386 case FI_SYMBOL:
1387 case ESAC_SYMBOL:
1388 case OD_SYMBOL:
1389 case BUS_SYMBOL:
1390 case OCCA_SYMBOL: {
1391 return A68G_TRUE;
1392 }
1393 }
1394 return A68G_FALSE;
1395 }
1396
1397 //! @brief Cast a string to lower case.
1398
1399 void make_lower_case (char *p)
1400 {
1401 for (; p != NO_TEXT && p[0] != NULL_CHAR; p++) {
1402 p[0] = (char) TO_LOWER (p[0]);
1403 }
1404 }
1405
1406 //! @brief Construct a linear list of tokens.
1407
1408 void tokenise_source (NODE_T ** root, int level, BOOL_T in_format, LINE_T ** l, char **s, LINE_T ** start_l, char **start_c)
1409 {
1410 char *lpr = NO_TEXT;
1411 int lprt = 0;
1412 while (l != NO_REF && !A68G_PARSER (stop_scanner)) {
1413 int att = 0;
1414 get_next_token (in_format, l, s, start_l, start_c, &att);
1415 if (A68G_PARSER (scan_buf)[0] == STOP_CHAR) {
1416 A68G_PARSER (stop_scanner) = A68G_TRUE;
1417 } else if (strlen (A68G_PARSER (scan_buf)) > 0 || att == ROW_CHAR_DENOTATION || att == LITERAL) {
1418 KEYWORD_T *kw;
1419 char *c = NO_TEXT;
1420 BOOL_T make_node = A68G_TRUE;
1421 char *trailing = NO_TEXT;
1422 if (att != IDENTIFIER) {
1423 kw = find_keyword (A68G (top_keyword), A68G_PARSER (scan_buf));
1424 } else {
1425 kw = NO_KEYWORD;
1426 }
1427 if (!(kw != NO_KEYWORD && att != ROW_CHAR_DENOTATION)) {
1428 if (att == IDENTIFIER) {
1429 make_lower_case (A68G_PARSER (scan_buf));
1430 }
1431 if (att != ROW_CHAR_DENOTATION && att != LITERAL) {
1432 size_t len = strlen (A68G_PARSER (scan_buf));
1433 while (len >= 1 && A68G_PARSER (scan_buf)[len - 1] == '_') {
1434 trailing = "_";
1435 A68G_PARSER (scan_buf)[len - 1] = NULL_CHAR;
1436 len--;
1437 }
1438 }
1439 c = TEXT (add_token (&A68G (top_token), A68G_PARSER (scan_buf)));
1440 } else {
1441 if (IS (kw, TO_SYMBOL)) {
1442 // Merge GO and TO to GOTO.
1443 if (*root != NO_NODE && IS (*root, GO_SYMBOL)) {
1444 ATTRIBUTE (*root) = GOTO_SYMBOL;
1445 NSYMBOL (*root) = TEXT (find_keyword (A68G (top_keyword), "GOTO"));
1446 make_node = A68G_FALSE;
1447 } else {
1448 att = ATTRIBUTE (kw);
1449 c = TEXT (kw);
1450 }
1451 } else {
1452 if (att == 0 || att == BOLD_TAG) {
1453 att = ATTRIBUTE (kw);
1454 }
1455 c = TEXT (kw);
1456 // Handle pragments.
1457 if (att == STYLE_II_COMMENT_SYMBOL || att == STYLE_I_COMMENT_SYMBOL || att == BOLD_COMMENT_SYMBOL) {
1458 char *nlpr = pragment (ATTRIBUTE (kw), l, s);
1459 if (lpr == NO_TEXT || strlen (lpr) == 0) {
1460 lpr = nlpr;
1461 } else {
1462 char *stale = lpr;
1463 lpr = new_string (lpr, "\n\n", nlpr, NO_TEXT);
1464 a68g_free (nlpr);
1465 a68g_free (stale);
1466 }
1467 lprt = att;
1468 make_node = A68G_FALSE;
1469 } else if (att == STYLE_I_PRAGMAT_SYMBOL || att == BOLD_PRAGMAT_SYMBOL) {
1470 char *nlpr = pragment (ATTRIBUTE (kw), l, s);
1471 if (lpr == NO_TEXT || strlen (lpr) == 0) {
1472 lpr = nlpr;
1473 } else {
1474 char *stale = lpr;
1475 lpr = new_string (lpr, "\n\n", nlpr, NO_TEXT);
1476 a68g_free (nlpr);
1477 a68g_free (stale);
1478 }
1479 lprt = att;
1480 if (!A68G_PARSER (stop_scanner)) {
1481 (void) set_options (OPTION_LIST (&A68G_JOB), A68G_FALSE);
1482 make_node = A68G_FALSE;
1483 }
1484 }
1485 }
1486 }
1487 // Add token to the tree.
1488 if (make_node) {
1489 NODE_T *q = new_node ();
1490 INFO (q) = new_node_info ();
1491 switch (att) {
1492 case ASSIGN_SYMBOL:
1493 case END_SYMBOL:
1494 case ESAC_SYMBOL:
1495 case OD_SYMBOL:
1496 case OF_SYMBOL:
1497 case FI_SYMBOL:
1498 case CLOSE_SYMBOL:
1499 case BUS_SYMBOL:
1500 case COLON_SYMBOL:
1501 case COMMA_SYMBOL:
1502 case DOTDOT_SYMBOL:
1503 case SEMI_SYMBOL: {
1504 GINFO (q) = NO_GINFO;
1505 break;
1506 }
1507 default: {
1508 GINFO (q) = new_genie_info ();
1509 break;
1510 }
1511 }
1512 STATUS (q) = OPTION_NODEMASK (&A68G_JOB);
1513 LINE (INFO (q)) = *start_l;
1514 CHAR_IN_LINE (INFO (q)) = *start_c;
1515 PRIO (INFO (q)) = 0;
1516 PROCEDURE_LEVEL (INFO (q)) = 0;
1517 ATTRIBUTE (q) = att;
1518 NSYMBOL (q) = c;
1519 PREVIOUS (q) = *root;
1520 SUB (q) = NEXT (q) = NO_NODE;
1521 TABLE (q) = NO_TABLE;
1522 MOID (q) = NO_MOID;
1523 TAX (q) = NO_TAG;
1524 if (lpr != NO_TEXT) {
1525 NPRAGMENT (q) = lpr;
1526 NPRAGMENT_TYPE (q) = lprt;
1527 lpr = NO_TEXT;
1528 lprt = 0;
1529 }
1530 if (*root != NO_NODE) {
1531 NEXT (*root) = q;
1532 }
1533 if (TOP_NODE (&A68G_JOB) == NO_NODE) {
1534 TOP_NODE (&A68G_JOB) = q;
1535 }
1536 *root = q;
1537 if (trailing != NO_TEXT) {
1538 diagnostic (A68G_WARNING, q, WARNING_TRAILING, trailing, att);
1539 }
1540 }
1541 // Redirection in tokenising formats. The scanner is a recursive-descent type as
1542 // to know when it scans a format text and when not.
1543 if (in_format && att == FORMAT_DELIMITER_SYMBOL) {
1544 return;
1545 } else if (!in_format && att == FORMAT_DELIMITER_SYMBOL) {
1546 tokenise_source (root, level + 1, A68G_TRUE, l, s, start_l, start_c);
1547 } else if (in_format && open_nested_clause (att)) {
1548 NODE_T *z = PREVIOUS (*root);
1549 if (z != NO_NODE && is_one_of (z, FORMAT_ITEM_N, FORMAT_ITEM_G, FORMAT_ITEM_H, FORMAT_ITEM_F, STOP)) {
1550 tokenise_source (root, level, A68G_FALSE, l, s, start_l, start_c);
1551 } else if (att == OPEN_SYMBOL) {
1552 ATTRIBUTE (*root) = FORMAT_OPEN_SYMBOL;
1553 } else if (OPTION_BRACKETS (&A68G_JOB) && att == SUB_SYMBOL) {
1554 ATTRIBUTE (*root) = FORMAT_OPEN_SYMBOL;
1555 } else if (OPTION_BRACKETS (&A68G_JOB) && att == ACCO_SYMBOL) {
1556 ATTRIBUTE (*root) = FORMAT_OPEN_SYMBOL;
1557 }
1558 } else if (!in_format && level > 0 && open_nested_clause (att)) {
1559 tokenise_source (root, level + 1, A68G_FALSE, l, s, start_l, start_c);
1560 } else if (!in_format && level > 0 && close_nested_clause (att)) {
1561 return;
1562 } else if (in_format && att == CLOSE_SYMBOL) {
1563 ATTRIBUTE (*root) = FORMAT_CLOSE_SYMBOL;
1564 } else if (OPTION_BRACKETS (&A68G_JOB) && in_format && att == BUS_SYMBOL) {
1565 ATTRIBUTE (*root) = FORMAT_CLOSE_SYMBOL;
1566 } else if (OPTION_BRACKETS (&A68G_JOB) && in_format && att == OCCA_SYMBOL) {
1567 ATTRIBUTE (*root) = FORMAT_CLOSE_SYMBOL;
1568 }
1569 }
1570 }
1571 }
1572
1573 //! @brief Tokenise source file, build initial syntax tree.
1574
1575 BOOL_T lexical_analyser (void)
1576 {
1577 LINE_T *l = NO_LINE, *start_l = NO_LINE;
1578 char *s = NO_TEXT, *start_c = NO_TEXT;
1579 NODE_T *root = NO_NODE;
1580 A68G_PARSER (scan_buf) = NO_TEXT;
1581 A68G_PARSER (max_scan_buf_length) = A68G_PARSER (source_file_size) = get_source_size ();
1582 // Errors in file?.
1583 if (A68G_PARSER (max_scan_buf_length) == 0) {
1584 return A68G_FALSE;
1585 }
1586 if (OPTION_RUN_SCRIPT (&A68G_JOB)) {
1587 A68G_PARSER (scan_buf) = (char *) get_temp_heap_space ((unt) (8 + A68G_PARSER (max_scan_buf_length)));
1588 if (!read_script_file ()) {
1589 return A68G_FALSE;
1590 }
1591 } else {
1592 A68G_PARSER (max_scan_buf_length) += KILOBYTE; // for the environ, more than enough
1593 A68G_PARSER (scan_buf) = (char *) get_temp_heap_space ((unt) A68G_PARSER (max_scan_buf_length));
1594 // Errors in file?.
1595 if (!read_source_file ()) {
1596 return A68G_FALSE;
1597 }
1598 }
1599 // Start tokenising.
1600 A68G_PARSER (read_error) = A68G_FALSE;
1601 A68G_PARSER (stop_scanner) = A68G_FALSE;
1602 if ((l = TOP_LINE (&A68G_JOB)) != NO_LINE) {
1603 s = STRING (l);
1604 }
1605 tokenise_source (&root, 0, A68G_FALSE, &l, &s, &start_l, &start_c);
1606 return A68G_TRUE;
1607 }
© 2002-2025 J.M. van der Veer (jmvdveer@xs4all.nl)
|