% Copyright 2012-2024, Alexander Shibakov
% This file is part of SPLinT
%
% SPLinT is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% SPLinT is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with SPLinT.  If not, see <http://www.gnu.org/licenses/>.

%\def\mkpurebyte{\uccode`\@=\yycp@\uppercase{\yybytepure{@}}\uccode`\@=`\@}

% make all symbol characters category 12; most macros are indifferent to this,
% however, if delimited macros are used to process the matched text, this option
% makes it easier to write such macros

% \mkpurebyte stores in \yybytepure a single character token whose character
% code is the current value of \yycp@; the token is made by temporarily
% setting the \uccode of `.' to \yycp@ and applying \uppercase to a `.'
% (\uppercase replaces the character code and keeps the category code of the
% `.' read here, presumably 12 under the catcodes in force when this file is
% loaded); since a \uccode of 0 means `leave unchanged' to \uppercase, a
% \yycp@ of 0 yields the `.' itself; afterwards the \uccode of `.' is set to
% the code of `.', so that \uppercase maps `.' to itself
\def\mkpurebyte{\uccode`\.=\yycp@\uppercase{\yybytepure{.}}\uccode`\.=`\.}

% \yyinput is the entry point of the input routine: \futurelet makes \next
% an alias of the upcoming token (without removing it from the input) and
% then hands control to \yyinp@t
\def\yyinput{\futurelet\next\yyinp@t} % get the code of the next character ...

% \yyinp@t inspects \next and dispatches as follows:
%   a token of the same category as \bgroup -> \yyinputgroup
%   a token that \if matches with \space    -> \yycp@=32, \yybyte={ }, \yyskipspace
%   a token that \if matches with \eolletter-> \yycp@=\n, \yybyte={\n}, \yyskipspace
%   anything else                           -> \yy@np@t (which picks up the token
%                                              as its argument)
% \yybreak, \yybreak@ and \yybreak@@ are defined elsewhere; presumably they
% close one, two and three levels of open conditionals, respectively, and
% discard the text up to \yycontinue -- confirm against their definitions
\def\yyinp@t{% test the code and decide whether to continue lexing the
             % token or return to the parser
  \ifcat\noexpand\next\bgroup 
      \yybreak\yyinputgroup
  \else
      \if\noexpand\next\space % code 32 character token is assumed to
% be an ordinary space (while category 10 tokens are not necessarily);
% the reason for this choice is the way code 32 characters are treated
% by \string (turned into `real' spaces); `funny' spaces of both kinds
% (a character that differs from a `real' space (\catcode`\ ==10
% \number`\ ==32) in either category or character code) can be
% created, however, category 10 characters of character code other
% than 32 are very rare (and take some effort to produce), while
% category 12, character code 32 tokens are produced as a result of
% `sanitizing' input all the time; note that the spaces are processed
% at the lowest level of the input routine and thus cannot be
% substituted by the switches below; this choice is not bullet proof
% but should suffice for most uses; in the case of more polluted
% input, an extra `sanitation' step can be performed first; if
% desired, category 10 characters can be assumed to be spaces at this
% point, as well, although this will introduce an extra test, affecting
% the (already poor) efficiency of the input macros.
          \yycp@=`\ % 
          \yybyte={ }%
          \mkpurebyte
          \yybreak@\yyskipspace % return the space
      \else
          \if\noexpand\next\eolletter % cannot use ^^M here since TeX will simply drop the rest of the line
              \yycp@=\n
              \yybyte={\n}%
              \mkpurebyte
              \yybreak@@\yyskipspace % return the end of line character
          \else
              \yybreak@@\yy@np@t
          \fi
      \fi
  \yycontinue
}

% some \Cee\ escape characters; the rest are either silly (like \a and \b) or
% are already defined to have other important functions in \TeX\ (such as \v and \t)

\chardef\n=`\^^J % ASCII line feed
\chardef\r=`\^^M % ASCII carriage return
\chardef\f=`\^^L % ASCII form feed
\chardef\HT=`\^^I % ASCII horizontal tab
\chardef\charseq=1 % code reported for a multicharacter control sequence
\chardef\charac=2 % code reported for an active character
\chardef\chargroup=3 % code reported for a braced group

% \yyskipspace removes the next token from the input by assigning it to \next
% with \let and then runs \yyreturn (queued by \afterassignment); the space
% stored after `=' in the definition takes the place of the one optional
% space that \let allows there, so that a space token waiting in the input is
% itself assigned (i.e. consumed) instead of being skipped over
\def\yyskipspace{\afterassignment\yyreturn\let\next= }

% \yyinputgroup{<group>}: the upcoming input item is a braced group; it is
% returned as one unit: \yybyte receives the group with its braces put back,
% \yycp@ is set to \chargroup and \mkpurebyte builds the matching \yybytepure
\def\yyinputgroup#1{%
  \yybyte{{#1}}%
  \yycp@\chargroup
  \mkpurebyte
  \yyreturn
}

% when \ifyyinputdebug is true, the input macros write the tokens they read
% to the terminal and the log
\newif\ifyyinputdebug

% \yy@np@t<token>: the general case of the input routine; the token is stored
% in \yybyte and then classified with the help of \ischar applied to
% \string<token>:
%   the control space \                   -> \returnexplicitspace
%   an active character                   -> returned with the code \charac
%   any other single character            -> \acharswitch (default: \achardefault)
%   a control sequence with a one
%   character name                        -> \onecharswitch (default: \onechardefault)
%   any other control sequence            -> \multicharswitch (default: \multichardefault)
% \switchon...\in is defined elsewhere; \default is set right before each
% switch, presumably to serve as the action for unlisted labels
\def\yy@np@t#1{% start lexing:
               % whitespace (category 10) tokens are skipped
               % automatically
  \yybyte{#1}%
  \ifyyinputdebug
      \immediate\write16{read: \the\yybyte\space after: \the\yytext@seen}%
  \fi
  \ifx#1\ % an explicit (control) space
      \yybreak\returnexplicitspace
  \else
      % two padding characters: the test succeeds only if \string produced
      % exactly one character
      \expandafter\ischar\string#100\end 
      \ifchar % a single character (not a control sequence)
          \ifcat\noexpand#1\noexpand~% is it an active character?
              \yycp@=\charac % yes, return it
              \mkpurebyte
              \yybreak@@\yyreturn
          \else % it is a non-active character ...
                % ... or a control sequence with an empty name (obtained by \csname\endcsname or
                % \endlinechar=-1 \toks0={\
                % } this case can be handled by the switch below but is ignored for the moment)
              \let\default\achardefault
              \yybreak@@{\switchon{\the\yybyte}\in\acharswitch}%
          \fi
      \else % it is a control sequence, return it
          % one padding character: the test succeeds only if \string produced
          % exactly two characters (the escape character and a one letter name)
          \expandafter\ischar\string#10\end 
          \ifchar % it is a one-char control sequence
              \let\default\onechardefault
              \yybreak@@{\switchon{\the\yybyte}\in\onecharswitch}%
          \else
              \let\default\multichardefault
              \yybreak@@{\switchon{\the\yybyte}\in\multicharswitch}%
          \fi
      \fi
  \yycontinue
}

% the character code reported for an explicit (control) space
\chardef\explicitspacecode=`\ %

% report an explicit space: only \yycp@ and \yybytepure are set here, \yybyte
% is left the way the caller prepared it
\def\returnexplicitspace{%
      \yycp@\explicitspacecode
      \mkpurebyte\yyreturn
}

% the switch consulted by \yy@np@t for control sequences with a one character
% name; the cases are laid out as \raw<labels>\raw{<action>} (the switch
% syntax itself is implemented elsewhere):
%   \n   is returned with the character code \n
%   \^^M is returned the same way as an explicit space
% every other one character control sequence is handled by \onechardefault
\def\onecharswitch{
    \raw \n \raw {%
        \yycp@=\n
        \mkpurebyte
        \yyreturn
    }
    \raw \^^M\raw {%
        \returnexplicitspace
    }
}

% the default action for a one character control sequence, say \x: \yybyte
% holds the sequence itself, so once \the\yybyte is expanded the first line
% reads \yycp@=`\x\relax, i.e. the code of the character x is reported
\def\onechardefault{%
    \expandafter\yycp@\expandafter=\expandafter`\the\yybyte\relax
    \mkpurebyte\yyreturn
}%

% the switch consulted by \yy@np@t for control sequences with longer names;
% the sequences listed here steer the input routine instead of being returned
% as ordinary tokens:
%   \vb, \insertraw, \stashed, \format, \formatlocal, \formatbegin, \formatp,
%     \sflush and \inputboundary hand over to helper macros that pick up their
%     arguments from the input;
%   \yyeof is returned with the code \YYENDOFBUFFERCHAR and an empty \yybytepure;
%   \flatten, \resetf, \inline and \skipheader are put back into the input as
%     \format{<sequence>}, \fold and \breakline as \formatlocal{<sequence>},
%     and \breakahead as \formatp<sequence>, after which \yyinput runs again;
%   \break and \squashtermstrue are returned as a space (the latter is
%     executed as well);
%   \endparse and \endparseinput raise an error.
% every other sequence is handled by \multichardefault
\def\multicharswitch{
    \raw\vb\raw {%
        \vbunwrap
    }
    \raw\insertraw\raw {%
        \insertrawnext
    }
    \raw\stashed\raw {%
        \stashnext
    }
    \raw\format \formatlocal\raw {%
        \formatnext
    }
    \raw\formatbegin\raw {%
        \fmtbegin
    }
    \raw\formatp\raw {%
        \fmtparam
    }
    \raw\sflush\raw {%
        \sflushnext
    }
    \raw\yyeof\raw {%
        \yycp@=\YYENDOFBUFFERCHAR\relax
        \yybytepure={}%
        \yyreturn
    }
    \raw\inputboundary\raw {%
        \inputboundarynext
    }
    \raw\flatten \resetf \inline \skipheader\raw {%
        \expandafter\yyinput\expandafter\format\expandafter{\the\yybyte}%
    }
    \raw\fold \breakline\raw {% 
        \expandafter\yyinput\expandafter\formatlocal\expandafter{\the\yybyte}%
    }
    \raw\breakahead\raw {% 
        \expandafter\yyinput\expandafter\formatp\the\yybyte
    }
    \raw\break\raw {% for testing purposes
        \yycp@=`\ %
        \yybytepure={ }% 
        \yyreturn
    }
    \raw\squashtermstrue\raw {%
        \yycp@=`\ %
        \yybytepure={ }% 
        \squashtermstrue
        \yyreturn
    }
    \raw\endparse \endparseinput\raw {%
        \errmessage{internal error: reading past the end of the input buffer}%
    }
}

% the default action for a multicharacter control sequence: the sequence
% (already stored in \yybyte by the caller) is reported with the code \charseq
\def\multichardefault{%
    \yycp@=\charseq
    \mkpurebyte\yyreturn
}%

% the single character switch `normalizes' the typesetting of underscores, as this
% is a particularly tricky character to get right; in typewriter fonts there is a
% `first class' underscore character so \_ may be defined as \char`\_ whereas in other fonts
% there is no `native' underscore character so \_ may be defined as a rule of an appropriate
% size
% the switch consulted by \yy@np@t for single non-active characters; its only
% case reports an underscore with the character code of `_' while replacing
% the contents of \yybyte by \_
% NOTE(review): unlike the cases of the other switches in this file, \_ and a
% space follow the second \raw here -- confirm against the switch
% implementation that this layout is intended
\def\acharswitch{
    \raw_\raw\_ {% the name parser will replace _ with \_ in subscripts
        \yycp@`\_\relax
        \yybyte{\_}%
        \mkpurebyte
        \yyreturn
    }
}

% the default action for a single non-active character: \yybyte holds the
% character, so once \the\yybyte is expanded the first line assigns the code
% of that character to \yycp@
\def\achardefault{%
    \expandafter\yycp@\expandafter=\expandafter`\the\yybyte\relax
    \mkpurebyte\yyreturn
}

% the following commands are only here for debugging purposes;
% they slow down the input
% NOTE(review): \setspecialcharsfrom is defined elsewhere; it is applied to
% each of the three switches defined above -- confirm that the remark about
% debugging still describes what it does

\setspecialcharsfrom\onecharswitch
\setspecialcharsfrom\multicharswitch
\setspecialcharsfrom\acharswitch

% care should be taken with using the next command; it will only be executed
% if the parser (rather the lexer) `sees' it as part of the input; thus if this
% command is inserted at the end of the parsed text and, say, a bootstrap parser
% terminates early, the command will not be executed; likewise, it may be executed 
% (more than) twice in case, for example, a parser stack is used and both (several)
% parsers see it before the parsing is terminated. 

% \insertraw{<commands>} found in the input: execute <commands> right away,
% then resume reading the input
\def\insertrawnext#1{#1\yyinput}

% \vb found in the input: #1 is the item that follows it, #2 is everything
% after that up to the next \vb; reading resumes with #1, followed by
% \stashed{#2} and the closing \vb
\def\vbunwrap#1#2\vb{\yyinput#1\stashed{#2}\vb}

% stash and format streams (lists)

% the stash stream: the character code reported for a stash item, the running
% count of the items seen so far and the identifier of the list
\chardef\stashchar=`\ %
\newcount\stashmarker
\def\yystash{[stash]}

% the format stream: the same three pieces
\chardef\formatchar=`\ %
\newcount\formatmarker
\def\yyformat{[format]}

% the variant of the stash handler that lets the lexer see the stash:
% \stashed{<material>} is reported as a character with the code \stashchar and
% a space as \yybytepure; <material> (in braces) is appended to \yybyte, the
% stash counter is advanced and, unless it is still below the value given by
% \showlistcounter\yystash, <material> is added to the \yystash list
\def\stashnextwithspace#1{%
    \yycp@=\stashchar
    \yybytepure={ }%
    \yybyte=\expandafter{\the\yybyte{#1}}%
    \advance\stashmarker by\@ne
    \ifnum\stashmarker<\showlistcounter\yystash\relax
    \else
        \appendtolist\yystash{#1}%
    \fi
    \yyreturn
}

% the mechanism for stash processing making stash invisible

% the variant of the stash handler that hides the stash from the lexer:
% <material> (in braces) is appended to \yybyte, \concat is applied to
% \yysubtext and \yybyte (presumably appending the latter to the former),
% the bookkeeping of the \yystash list is done the same way as in the
% visible variant, and reading resumes with \yyinput instead of returning
\def\stashnextwithnothing#1{%
    \yybyte=\expandafter{\the\yybyte{#1}}%
    \concat\yysubtext\yybyte
    \advance\stashmarker by\@ne
    \ifnum\stashmarker<\showlistcounter\yystash\relax
    \else
        \appendtolist\yystash{#1}%
    \fi
    \ifyyinputdebug
        \immediate\write16{mid text: \the\yysubtext}%
    \fi
    \yyinput
}

% collect the stash in the stash list (stream); \stashnext is the handler
% that \multicharswitch runs for \stashed, and the variant chosen here is
% the one that resumes reading instead of returning the stash to the lexer

\let\stashnext=\stashnextwithnothing

% \format{<material>} (or \formatlocal{<material>}) found in the input: it is
% reported as a character with the code \formatchar and a space as
% \yybytepure; <material> is saved in \toksa, the format counter is advanced
% and, unless it is still below the value given by \showlistcounter\yyformat,
% the triggering sequence together with {<material>} is added to the \yyformat
% list; only after that is {<material>} appended to \yybyte
\def\formatnext#1{%
    \yycp@=\formatchar
    \yybytepure={ }%
    \toksa{#1}%
    \advance\formatmarker by\@ne
    \ifnum\formatmarker<\showlistcounter\yyformat\relax
    \else
        % \yybyte still holds just the triggering sequence here: insert the iterator sequence, as well
        \expandafter\appendtolisti\expandafter\yyformat\expandafter{\the\yybyte{#1}}%
    \fi
    \yybyte=\expandafter{\the\yybyte{#1}}%
    \yyreturn
}

% both macros repackage their arguments and pass them on to \formatnext:
% \fmtbegin takes everything up to \fmtend, \fmtparam takes a sequence and its
% single parameter, which is put back in braces
\def\fmtbegin#1\fmtend{\formatnext{#1}} % multiparameter format sequences
\def\fmtparam#1#2{\formatnext{#1{#2}}} % single parameter format sequences

% the character code reported for an input boundary
\chardef\boundarychar=`\ %

% the following is a minimal setup of a parsing boundary:
% \inputboundary{<material>} is reported as a character with the code
% \boundarychar and a space as \yybytepure, with <material> (in braces)
% appended to \yybyte

\def\inputboundarynext#1{% `l' for left boundary
    \yycp@=\boundarychar
    \yybytepure={ }%
    \yybyte=\expandafter{\the\yybyte{#1}}%
    \yyreturn % inserting something here will effectively insert it into the input stream
}

% the next macro is not implemented correctly (since after a backup the flushed
% tokens may be reinserted); there needs to be a mechanism to remember that the
% tokens are already in the stash stream

% \sflush<marker><contents> found in the input (#1 is the marker, #2 is the
% contents): it is reported as a character with the code \stashchar and a
% space as \yybytepure; both arguments (in braces) are appended to \yybyte
% and \concat is applied to \yysubtext and \yybyte
\def\sflushnext#1#2{%
    \yycp@=\stashchar
    \yybytepure={ }%
    % `save the flushed code' goes here
    \yybyte=\expandafter{\the\yybyte{#1}{#2}}%
    \concat\yysubtext\yybyte
    \ifyyinputdebug
        \immediate\write16{mid text: \the\yysubtext}%
    \fi
    \yyreturn
}

% \z@rotest is the pattern \ischar compares against; \ifchar holds the result
\def\z@rotest{0}
\newif\ifchar

% \ischar<characters>\end sets \ifchar to true exactly when whatever follows
% the first two items of <characters> is the single character 0; callers feed
% it the output of \string padded with zeros, so the result tells how many
% characters \string has produced; there are three parameters because #1 can
% be an \escapechar
\def\ischar#1#2#3\end{%
    \def\lastnamechar{#3}%
    \ifx\lastnamechar\z@rotest
        \chartrue
    \else
        \charfalse
    \fi
}

% trivial input routine

% \yyinputtrivial is the entry point of the trivial input routine: \futurelet
% makes \next an alias of the upcoming token (without removing it from the
% input) and then hands control to \yyinp@ttrivial
\def\yyinputtrivial{\futurelet\next\yyinp@ttrivial} % get the code of the next character ...

% \yyinp@ttrivial inspects \next: a token of category 10 is reported as a
% space (code 32) and removed from the input by \yyskipspace, anything else
% is passed to \yy@np@ttrivial, which picks it up as its argument
\def\yyinp@ttrivial{% the `%' keeps the end of this line from adding a space token to the macro
    \ifcat\noexpand\next\space % category 10 token
        \yycp@=`\ % 
        \yybyte={ }%
        \mkpurebyte
        \yybreak\yyskipspace % return the space
    \else
        \yybreak\yy@np@ttrivial
    \yycontinue
}

% \yy@np@ttrivial<token>: classify a single token for the trivial routine:
%   a token of the same category as
%   the letter a                 -> its own character code; \yybyte and
%                                   \yybytepure both hold the token
%   a token that \if matches
%   with \eolletter              -> the code \n; \yybyte holds the one step
%                                   expansion of \eolletter
%   \yyeof                       -> the code \YYENDOFBUFFERCHAR; \yybytepure
%                                   is empty
%   anything else                -> the code \charseq
% NOTE(review): in the last case \yybyte is not assigned, so it keeps the
% value it had before the call -- confirm that this is intended
\def\yy@np@ttrivial#1{%
    \ifcat\noexpand#1a%
        \yycp@`#1%
        \yybyte{#1}%
        \yybytepure{#1}%
    \else
        \if\noexpand#1\eolletter
            \yycp@=\n
            \yybyte\expandafter{\eolletter}%
            \mkpurebyte
        \else
            \ifx#1\yyeof
                \yycp@=\YYENDOFBUFFERCHAR\relax
                \yybyte{#1}%
                \yybytepure={}%
            \else
                \yycp@\charseq
                \mkpurebyte
            \fi
        \fi
    \fi
    \yyreturn
}