// Author: maple
// date: 9/24/25

package org.openautonomousconnection.htmlparser.interpreter;

import org.openautonomousconnection.StringUtils_Remove_Please;
import org.openautonomousconnection.htmlparser.Parser;
import org.openautonomousconnection.htmlparser.TagManager;
import org.openautonomousconnection.htmlparser.html.HTML;
import org.openautonomousconnection.htmlparser.html.HTMLElement;
import org.openautonomousconnection.htmlparser.interpreter.html.exception.ExpectStringException;
import org.openautonomousconnection.htmlparser.interpreter.html.exception.UnexpectedTokenException;
import org.openautonomousconnection.htmlparser.interpreter.html.state.HTMLState;
import lombok.Getter;
import org.openautonomousconnection.htmlparser.interpreter.script.ScriptInterpreter;

import java.util.Arrays;
import java.util.Map;
import java.util.Stack;

/**
 * Token-driven state machine that turns a stream of HTML tokens into elements.
 * <p>
 * Each call to {@link #nextState(String)} dispatches the token to the handler of the
 * current {@link HTMLState}; the handler's return value becomes the next state. Open
 * elements are tracked as a stack of {@link ElementBuilder}s, and script content is
 * delegated to the {@link ScriptInterpreter} passed to the constructor.
 */
public class HTMLInterpreter implements Interpreter {

    @Getter
    private HTMLState currentState = HTMLState.TAG;
    // Used to go up a layer after comment is opened
    private HTMLState inbetweenState = HTMLState.COMMENT;

    @Getter
    private final Parser parser;
    private final TagManager tagManager;
    // Typed stack: handlers call getTagName()/build()/setText() on the elements
    private final Stack<ElementBuilder> elementBuilders;
    private StringBuilder currentAttribute, currentValue, currentText, currentClosingTag;
    public int currentLine = 1;
    private HTMLElement currentElement;
    private final ScriptInterpreter scriptInterpreter;

    /**
     * Create an interpreter that starts in {@link HTMLState#TAG} on line 1
     * @param parser parser that supplies the {@link TagManager} and is handed to every ElementBuilder
     * @param scriptInterpreter interpreter that receives the tokens of script elements
     */
    public HTMLInterpreter(Parser parser, ScriptInterpreter scriptInterpreter) {
        this.parser = parser;
        this.tagManager = parser.getTagManager();
        this.scriptInterpreter = scriptInterpreter;

        this.currentText = new StringBuilder();
        this.currentClosingTag = new StringBuilder();
        this.currentAttribute = new StringBuilder();
        this.currentValue = new StringBuilder();

        this.elementBuilders = new Stack<>();
    }

    /**
     * Feed the next token into the state machine.
     * Blank tokens never change the state; a token ending in a line feed advances
     * {@link #currentLine} by one after it has been handled.
     * @param token next token
     */
    @Override
    public void nextState(String token) {
        boolean newLine = token.endsWith("\n");

        // Blank tokens only matter for line counting
        if (token.isBlank()) {
            if (newLine) {
                this.currentLine++;
            }
            return;
        }

        this.currentState = switch (this.currentState) {
            case TAG -> tag(token.strip());
            case CLOSE_TAG -> close_tag(token.strip());
            case TEXT -> text(token);
            case DOCTYPE -> doctype(token.strip());
            case ATTRIBUTE -> attribute(token.strip());
            case ATTRIBUTE_EQUALS -> attribute_equals(token.strip());
            case COMMENT -> comment(token);
            case VALUE -> value(token);
            case SCRIPT -> script(token);
            default -> this.currentState;
        };

        // Incremented after dispatch so handlers report the line the token started on
        if (newLine) {
            this.currentLine++;
        }
    }

    /**
     * @return always {@code false}; this interpreter never reports completion
     */
    @Override
    public boolean finished() {
        return false;
    }

    /**
     * @return the current element cast to {@link HTML}; {@code null} if nothing was closed yet
     * @throws ClassCastException if the current element is not an {@link HTML} instance
     */
    public HTML getResult() {
        return (HTML) this.currentElement;
    }

    // Only public at the moment because of JavaScriptInterpreter
    /**
     * Remove every angle bracket from a token
     * @param token raw token
     * @return token without any '&lt;' or '&gt;' characters
     */
    public static String stripTag(String token) {
        return token.replace("<", "").replace(">", "");
    }

    /**
     * Open a script
     * @param token script
     * @return next state
     */
    private HTMLState script(String token) {
        this.scriptInterpreter.currentLine = this.currentLine;
        this.scriptInterpreter.nextState(token);

        // TODO: Change for release. This is debug code
        if (this.scriptInterpreter.finished()) {
            // the ScriptInterpreter already has its own ElementBuilder
            this.elementBuilders.pop();
            this.elementBuilders.push(this.scriptInterpreter.getElementBuilder());

            return this.close_tag(token);
        }

        return HTMLState.SCRIPT;
    }

    /**
     * Open a tag
     * @param token tag
     * @return next state
     */
    private HTMLState tag(String token) {
        String tagName = stripTag(token);
        boolean hasText = this.tagManager.hasText(tagName);

        if (tagName.equalsIgnoreCase("!DOCTYPE")) {
            return HTMLState.DOCTYPE;
        } else if (tagName.stripLeading().startsWith("!--")) {
            return returnCommentState();
        }

        this.elementBuilders.push(new ElementBuilder(this.parser, tagName));

        String[] split = new String[] {token};

        if (token.contains(">")) {
            split = StringUtils_Remove_Please.splitSeq(new String[]{ token.substring(0, token.indexOf('>')) }, ">");
        }

        // TODO: Change for release. This is debug code
        if (this.elementBuilders.peek().getTagName().equals("script")) {
            return split.length == 1 ? HTMLState.SCRIPT : script(token.substring(token.indexOf(">") + 1));
        }

        // Tag is still open: whatever follows is an attribute
        if (!token.contains(">")) {
            return HTMLState.ATTRIBUTE;
        }

        if (split.length == 1) {
            return hasText ? HTMLState.TEXT : HTMLState.TAG;
        } else {
            return attribute(token.substring(token.indexOf('>') + 1));
        }
    }

    /**
     * Close a tag
     * @param token closing tag
     * @return next state
     * @throws UnexpectedTokenException if the accumulated closing tag does not match the open element
     */
    private HTMLState close_tag(String token) {
        this.currentClosingTag.append(token.toLowerCase().strip());
        String ct = this.currentClosingTag.toString();
        String tagName = this.elementBuilders.peek().getTagName();

        // one instruction tags don't have a closing tag
        if (!this.tagManager.hasText(tagName)) {
            this.elementBuilders.pop();
            // reset, otherwise this token would prefix the next closing tag
            this.currentClosingTag = new StringBuilder();
            return HTMLState.TEXT;
        }

        // Comments are special
        // NOTE(review): the non-comment branch was an empty string in the revision under
        // review, which made every ordinary closing tag fall through to the exception
        // below. "</name>" is the assumed intent - confirm against version control.
        String should = tagName.equals("--") ? tagName + '>' : "</" + tagName + '>';

        if (should.equals(ct)) {
            if (this.currentElement != null) {
                this.currentElement = this.currentElement.append(this.elementBuilders.pop().build());
            } else {
                this.currentElement = this.elementBuilders.pop().build();
            }

            if (!(this.currentElement instanceof HTML)) {
                this.currentElement = this.currentElement.getParent();
            }

            this.currentClosingTag = new StringBuilder();
            return HTMLState.TEXT;
        }
        // not reached yet
        // NOTE(review): returns TEXT although a CLOSE_TAG state exists - confirm
        else if (should.startsWith(ct)) {
            return HTMLState.TEXT;
        }
        // token not the same as the expected closing tag
        else {
            throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
        }
    }

    /**
     * Collect text content, or hand over to the tag handlers when a tag starts
     * @param token text or tag
     * @return next state
     */
    private HTMLState text(String token) {
        String strip = token.stripLeading();

        // handle string begin
        if (this.currentText.isEmpty()) {
            if (strip.startsWith("<")) {
                return tag(strip);
            }

            this.currentText.append(token);
            return HTMLState.TEXT;
        }
        // handle string end or nested elements
        else if (token.startsWith("<")) {
            this.elementBuilders.peek().setText(this.currentText.toString());

            // always reset
            this.currentText = new StringBuilder();

            // NOTE(review): the remainder of this method was missing from the revision
            // under review (the text was cut off after `token.startsWith("`). The
            // closing-tag / opening-tag dispatch below is a reconstruction - verify
            // against version control before merging.
            if (token.startsWith("</")) {
                return close_tag(token);
            }
            return tag(token);
        }

        // NOTE(review): reconstructed - assumes further text tokens are accumulated
        this.currentText.append(token);
        return HTMLState.TEXT;
    }

    /**
     * Declare an attribute
     * @param token attribute name, optionally followed by '=' and the value
     * @return next state
     */
    private HTMLState attribute(String token) {
        // NOTE(review): this method's header and the first operand of the condition were
        // missing from the revision under review; only `|| token.endsWith(">")` and the
        // branches below were visible. The startsWith operand is an assumption.
        if (token.startsWith(">") || token.endsWith(">")) {
            return text(token);
        } else if (token.contains("=")) {
            // Recursion if declaration and equals are same token
            this.currentAttribute = new StringBuilder(token.substring(0, token.indexOf('=')));
            return attribute_equals(token.substring(token.indexOf('=')));
        } else {
            this.currentAttribute = new StringBuilder(token);
            return HTMLState.ATTRIBUTE_EQUALS;
        }
    }

    /**
     * Handle equals operator between attribute declaration and definition (can only be '='; will throw otherwise)
     * @param token equals operator
     * @return next state
     * @throws UnexpectedTokenException if the token is neither "=" nor contains a quoted value
     */
    private HTMLState attribute_equals(String token) {
        boolean dq = token.contains("\""), sq = token.contains("'");

        if (dq || sq) {
            char quot = dq ? '"' : '\'';
            // Recursion if equals and value are same token.
            // Start at the quote itself: value(...) requires a leading quote, and starting
            // one character earlier fails with an index error when the quote is at index 0.
            return value(token.substring(token.indexOf(quot)), quot);
        } else if (token.equals("=")) {
            return HTMLState.VALUE;
        } else {
            throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
        }
    }

    /**
     * Define an attribute
     * @param token attribute value
     * @return next state
     */
    private HTMLState value(String token) {
        // ' ' means "quote character not known yet"
        return value(token, ' ');
    }

    /**
     * Define a string attribute
     * @param token attribute value
     * @param quot quotation sign, or ' ' to take it from the first character of the token
     * @return next state
     * @throws ExpectStringException if the token starts with neither a single nor a double quote
     */
    private HTMLState value(String token, char quot) {
        // expected string, got other
        if (!token.startsWith("'") && !token.startsWith("\"")) {
            throw new ExpectStringException(token, this.currentLine, this.currentState);
        }

        this.currentValue = new StringBuilder();
        char quote = quot != ' ' ? quot : token.charAt(0);

        // split by quote character
        String[] split = token.split(String.valueOf(quote));

        for (int i = 0; i < split.length; i++) {
            // handle escaped quote character
            if (split[i].endsWith("\\")) {
                this.currentValue.append(split[i]).append(quote);
                split[i] = "";
            }
        }

        // delete first quotation character
        // NOTE(review): currentValue never holds the opening quote at this point, so this
        // drops the first character of the escaped segment - confirm intent.
        if (!this.currentValue.isEmpty()) {
            this.currentValue.deleteCharAt(0);
        }

        StringBuilder rebuilt = new StringBuilder();

        // TODO possible error source
        for (String s : split) {
            if (!s.isEmpty()) {
                rebuilt.append(s);
            }
        }

        if (!rebuilt.isEmpty() && this.tagManager.hasText(stripTag(token))) {
            return text(token);
        }

        return HTMLState.ATTRIBUTE;
    }

    /**
     * Comment on code
     * @param token comment
     * @return next state
     */
    private HTMLState comment(String token) {
        // append comment
        if (!token.contains("-->")) {
            this.currentText.append(token);
            return HTMLState.COMMENT;
        }

        // end comment
        ElementBuilder elementBuilder = new ElementBuilder(this.parser, "--");
        elementBuilder.setText(this.currentText.toString());

        // always reset
        this.currentText = new StringBuilder();

        this.elementBuilders.push(elementBuilder);

        // length is 1 only when text precedes the terminator and nothing follows it
        if (token.split("-->").length == 1) {
            return commentResetInbetween();
        }

        this.currentState = commentResetInbetween();
        return close_tag(token.substring(token.indexOf("-->")));
    }

    /**
     * Define the doctype
     * @param token document type
     * @return next state
     */
    private HTMLState doctype(String token) {
        String tag = stripTag(token);

        if (!tag.equalsIgnoreCase("HTML")) {
            /* Not implemented. Might do so in the future, might not. */
        }

        // The doctype declaration ends with the first token that closes the bracket
        if (token.endsWith(">")) {
            return HTMLState.TEXT;
        } else {
            return HTMLState.DOCTYPE;
        }
    }

    // Helper methods

    /**
     * Reset inbetween state
     * @return previous inbetween state
     */
    private HTMLState commentResetInbetween() {
        HTMLState temp = this.inbetweenState;
        this.inbetweenState = HTMLState.COMMENT;
        return temp;
    }

    /**
     * Never forget to set the inbetween state!
     * @return HTMLState.COMMENT
     */
    private HTMLState returnCommentState() {
        this.inbetweenState = this.currentState;
        return HTMLState.COMMENT;
    }

    /**
     * Continue down without returning own State
     * @param token next token
     * @return this.currentState
     */
    private HTMLState nextTokenDontReturn(String token) {
        this.nextState(token);
        return this.currentState;
    }

    /**
     * Continue down without returning own State, and close the current tag
     * @param token next token
     * @return this.currentState
     */
    private HTMLState closeTagDontReturn(String token) {
        this.close_tag(token);
        return this.currentState;
    }
}