Files
html-parser/src/main/java/org/openautonomousconnection/htmlparser/interpreter/HTMLInterpreter.java
2025-12-13 15:15:01 +00:00

448 lines
12 KiB
Java

// Author: maple
// date: 9/24/25
package org.openautonomousconnection.htmlparser.interpreter;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import lombok.Getter;
import org.openautonomousconnection.StringUtils_Remove_Please;
import org.openautonomousconnection.htmlparser.Parser;
import org.openautonomousconnection.htmlparser.TagManager;
import org.openautonomousconnection.htmlparser.html.HTML;
import org.openautonomousconnection.htmlparser.html.HTMLElement;
import org.openautonomousconnection.htmlparser.interpreter.html.exception.ExpectStringException;
import org.openautonomousconnection.htmlparser.interpreter.html.exception.UnexpectedTokenException;
import org.openautonomousconnection.htmlparser.interpreter.html.state.HTMLState;
import org.openautonomousconnection.htmlparser.interpreter.script.ScriptInterpreter;
/**
 * Token-driven state machine for HTML input.
 * <p>
 * Each call to {@code nextState} hands one token to the handler selected by
 * {@code currentState}; the handler's return value becomes the new state.
 * Elements that have been opened but not yet closed are kept as
 * {@code ElementBuilder}s on a stack, and {@code getResult} exposes the
 * element assembled so far.
 */
public class HTMLInterpreter implements Interpreter {
// State that selects the handler for the next token; starts at TAG.
@Getter
private HTMLState currentState = HTMLState.TAG;
// Used to go up a layer after comment is opened: holds the state that was active
// when a comment started, and is set back to COMMENT once that state has been restored.
private HTMLState inbetweenState = HTMLState.COMMENT;
// Parser this interpreter belongs to; handed to every ElementBuilder created here.
@Getter
private Parser parser;
// Obtained from the parser in the constructor; queried through hasText(tagName).
private TagManager tagManager;
// Builders of the currently open elements; the top entry is the innermost one.
private Stack<ElementBuilder> elementBuilders;
// Accumulators for the attribute name, attribute value, text content and closing tag in progress.
private StringBuilder currentAttribute, currentValue, currentText, currentClosingTag;
// Line counter: starts at 1 and is incremented for every token that ends with '\n'.
// Public; also copied into the script interpreter and passed to thrown exceptions.
public int currentLine = 1;
// Element assembled so far; updated whenever a closing tag matches.
private HTMLElement currentElement;
// Receives the tokens while the interpreter is in the SCRIPT state.
private ScriptInterpreter scriptInterpreter;
/**
 * Creates an interpreter bound to the given parser and script interpreter.
 * The tag manager is taken from the parser; the builder stack and all
 * accumulators start out empty.
 *
 * @param parser            parser that supplies the tag manager and is passed to element builders
 * @param scriptInterpreter interpreter that consumes the tokens while in the SCRIPT state
 */
public HTMLInterpreter(Parser parser, ScriptInterpreter scriptInterpreter) {
    // collaborators
    this.parser = parser;
    this.scriptInterpreter = scriptInterpreter;
    this.tagManager = parser.getTagManager();

    // working state
    this.elementBuilders = new Stack<>();
    this.currentAttribute = new StringBuilder();
    this.currentValue = new StringBuilder();
    this.currentText = new StringBuilder();
    this.currentClosingTag = new StringBuilder();
}
/**
 * Feeds one token into the state machine.
 * <p>
 * Blank tokens never reach a handler. Otherwise the handler that belongs to
 * the current state is invoked and its return value becomes the new state.
 * Structural states receive the token stripped on both sides; TEXT, COMMENT,
 * VALUE and SCRIPT receive it unchanged. In either case a token ending in
 * {@code '\n'} advances the line counter by one.
 *
 * @param token next token of the input
 */
@Override
public void nextState(String token) {
    final boolean endsLine = token.endsWith("\n");

    if (!token.isBlank()) {
        final String trimmed = token.strip();
        HTMLState next;
        switch (this.currentState) {
            case TAG -> next = tag(trimmed);
            case CLOSE_TAG -> next = close_tag(trimmed);
            case TEXT -> next = text(token);
            case DOCTYPE -> next = doctype(trimmed);
            case ATTRIBUTE -> next = attribute(trimmed);
            case ATTRIBUTE_EQUALS -> next = attribute_equals(trimmed);
            case COMMENT -> next = comment(token);
            case VALUE -> next = value(token);
            case SCRIPT -> next = script(token);
            // any other state is left as it is
            default -> next = this.currentState;
        }
        this.currentState = next;
    }

    // counted after the handler ran, so handlers still see the line the token started on
    if (endsLine) {
        this.currentLine++;
    }
}
/**
 * Reports whether interpretation is complete.
 *
 * @return always {@code false} in this implementation
 */
@Override
public boolean finished() {
    return false;
}
/**
 * Returns the element assembled so far, cast to {@code HTML}.
 *
 * @return the current element as {@code HTML}; {@code null} if nothing has been closed yet
 * @throws ClassCastException if the current element is not an {@code HTML} instance
 */
public HTML getResult() {
    final HTMLElement assembled = this.currentElement;
    return (HTML) assembled;
}
/**
 * Removes every {@code '<'} and {@code '>'} character from the token.
 * Only public at the moment because of JavaScriptInterpreter.
 *
 * @param token token to clean
 * @return the token without any angle brackets
 */
public static String stripTag(String token) {
    final StringBuilder cleaned = new StringBuilder(token.length());
    for (int i = 0; i < token.length(); i++) {
        final char c = token.charAt(i);
        if (c != '<' && c != '>') {
            cleaned.append(c);
        }
    }
    return cleaned.toString();
}
/**
 * Forwards a token to the script interpreter.
 * <p>
 * The current line number is copied over first. While the script interpreter
 * is not finished the state stays SCRIPT; once it is, the builder on top of
 * the stack is replaced by the script interpreter's own builder and the token
 * is handled as a closing tag.
 *
 * @param token script
 * @return next state
 */
private HTMLState script(String token) {
    this.scriptInterpreter.currentLine = this.currentLine;
    this.scriptInterpreter.nextState(token);

    // TODO: Change for release. This is debug code
    if (!this.scriptInterpreter.finished()) {
        return HTMLState.SCRIPT;
    }

    // the ScriptInterpreter already has its own ElementBuilder, so swap it in
    this.elementBuilders.pop();
    this.elementBuilders.push(this.scriptInterpreter.getElementBuilder());
    return this.close_tag(token);
}
/**
 * Open a tag.
 * <p>
 * {@code !DOCTYPE} switches to the DOCTYPE state and a name starting with
 * {@code !--} switches to the COMMENT state; neither pushes a builder.
 * For everything else a new builder is pushed and the follow-up state depends
 * on whether the token already contains {@code '>'}.
 *
 * @param token tag
 * @return next state
 */
private HTMLState tag(String token) {
    final String name = stripTag(token);
    final boolean carriesText = this.tagManager.hasText(name);

    if (name.equalsIgnoreCase("!DOCTYPE")) {
        return HTMLState.DOCTYPE;
    }
    if (name.stripLeading().startsWith("!--")) {
        return returnCommentState();
    }

    this.elementBuilders.push(new ElementBuilder(this.parser, name));

    final int closeIdx = token.indexOf('>');
    final boolean closed = closeIdx >= 0;
    // without '>' the token is its own single part; otherwise only the part before '>' is split
    final String[] parts = closed
            ? StringUtils_Remove_Please.splitSeq(new String[]{
                    token.substring(0, closeIdx)
              }, ">")
            : new String[] {token};

    // TODO: Change for release. This is debug code
    if (this.elementBuilders.peek().getTagName().equals("script")) {
        if (parts.length == 1) {
            return HTMLState.SCRIPT;
        }
        return script(token.substring(closeIdx + 1));
    }

    // tag not closed yet: attributes follow
    if (!closed) {
        return HTMLState.ATTRIBUTE;
    }
    if (parts.length != 1) {
        return attribute(token.substring(closeIdx + 1));
    }
    return carriesText ? HTMLState.TEXT : HTMLState.TAG;
}
/**
 * Close a tag.
 * <p>
 * The token is lower-cased, stripped and appended to the closing-tag buffer.
 * The buffer is then compared with the closing tag expected for the innermost
 * open element ({@code </name>}, or {@code -->} for the comment pseudo tag
 * {@code --}). On a match the element is built and attached to the current
 * element. If the buffer is only a prefix of the expected tag, nothing is
 * closed yet.
 *
 * @param token closing tag (or a fragment of one)
 * @return next state
 * @throws UnexpectedTokenException if no element is open, or if the accumulated closing tag
 *                                  does not belong to the innermost open element
 */
private HTMLState close_tag(String token) {
    // nothing open: report it as a parser error instead of an EmptyStackException
    if (this.elementBuilders.isEmpty())
        throw new UnexpectedTokenException(token, this.currentLine, this.currentState);

    // Locale.ROOT keeps the lower-casing independent of the platform locale
    this.currentClosingTag.append(token.toLowerCase(Locale.ROOT).strip());
    String ct = this.currentClosingTag.toString();
    String tagName = this.elementBuilders.peek().getTagName();

    // one-instruction tags don't have a closing tag
    // NOTE(review): the popped builder is discarded without being built — confirm this is intended
    if (!this.tagManager.hasText(tagName)) {
        this.elementBuilders.pop();
        // drop the buffered token, otherwise it would be prepended to the next closing tag
        this.currentClosingTag = new StringBuilder();
        return HTMLState.TEXT;
    }

    // Comments are special; the buffer is lower-case, so the expected tag has to be as well
    String should = tagName.equals("--")
            ? tagName + '>'
            : "</" + tagName.toLowerCase(Locale.ROOT) + ">";

    if (should.equals(ct)) {
        if (this.currentElement != null)
            this.currentElement = this.currentElement.append(this.elementBuilders.pop().build());
        else
            this.currentElement = this.elementBuilders.pop().build();

        // step back up one level unless the HTML element itself was reached
        if (!(this.currentElement instanceof HTML))
            this.currentElement = this.currentElement.getParent();

        this.currentClosingTag = new StringBuilder();
        return HTMLState.TEXT;
    }
    // </should> not reached yet
    else if (should.startsWith(ct))
        return HTMLState.TEXT;
    // token not the same as </should>
    else
        throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
}
/**
 * Collect text content, or leave text mode when markup starts.
 * <p>
 * A token that (ignoring leading whitespace) starts with {@code '<'} ends the
 * text run: buffered text, if any, is stored on the innermost open element,
 * and the token is handed to {@code close_tag} when it starts with
 * {@code "</"} and to {@code tag} otherwise. Any other token is appended to
 * the text buffer unchanged.
 *
 * @param token text or the markup that ends it
 * @return next state
 */
private HTMLState text(String token) {
    final String strip = token.stripLeading();

    // continue as text
    if (!strip.startsWith("<")) {
        this.currentText.append(token);
        return HTMLState.TEXT;
    }

    // handle string end or nested elements
    if (!this.currentText.isEmpty()) {
        this.elementBuilders.peek().setText(this.currentText.toString());
        // always reset
        this.currentText = new StringBuilder();
    }

    final String markup = strip.stripTrailing();
    // a closing tag must be closed even when no text was buffered,
    // otherwise "</p>" would open an element named "/p"
    if (markup.startsWith("</"))
        return close_tag(markup);
    return tag(markup);
}
/**
 * Declare an attribute.
 * <p>
 * A token that starts or ends with {@code '>'} is passed on to the text
 * handler. A token containing {@code '='} is split there: the left part
 * becomes the attribute name and the rest (including the {@code '='}) goes to
 * the equals handler. Any other token is taken as the attribute name.
 *
 * @param token attribute type
 * @return next state
 */
private HTMLState attribute(String token) {
    final boolean touchesTagEnd = token.startsWith(">") || token.endsWith(">");
    if (touchesTagEnd) {
        return text(token);
    }

    final int equalsIdx = token.indexOf('=');
    if (equalsIdx < 0) {
        this.currentAttribute = new StringBuilder(token);
        return HTMLState.ATTRIBUTE_EQUALS;
    }

    // Recursion if declaration and equals are same token
    this.currentAttribute = new StringBuilder(token.substring(0, equalsIdx));
    return attribute_equals(token.substring(equalsIdx));
}
/**
 * Handle equals operator between attribute declaration and definition (can only be '='; will throw otherwise).
 * <p>
 * If the token already contains a quote, the value is part of the same token
 * and is passed on together with the character that precedes the opening
 * quote. The opening quote is whichever of {@code "} and {@code '} comes first.
 *
 * @param token equals operator
 * @return next state
 * @throws UnexpectedTokenException if the token is neither {@code "="} nor an operator followed by
 *                                  a quoted value, or if it starts directly with a quote
 */
private HTMLState attribute_equals(String token) {
    final int dq = token.indexOf('"');
    final int sq = token.indexOf('\'');

    if (dq >= 0 || sq >= 0) {
        // the quote that appears first opens the value; the other kind may occur inside it
        final boolean doubleFirst = dq >= 0 && (sq < 0 || dq < sq);
        final char quot = doubleFirst ? '"' : '\'';
        final int quotIdx = doubleFirst ? dq : sq;

        // nothing in front of the quote means the operator is missing
        if (quotIdx == 0)
            throw new UnexpectedTokenException(token, this.currentLine, this.currentState);

        // Recursion if declaration and equals are same token
        return value(token.substring(quotIdx - 1), quot);
    }
    else if (token.equals("=")) {
        return HTMLState.VALUE;
    }
    else {
        throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
    }
}
/**
 * Define an attribute.
 * Delegates to the two-argument overload with a blank as quotation sign,
 * which tells it to take the quotation sign from the token itself.
 *
 * @param token attribute value
 * @return next state
 */
private HTMLState value(String token) {
    final char takeFromToken = ' ';
    return value(token, takeFromToken);
}
/**
 * Define a string attribute.
 * <p>
 * When {@code quot} is a blank, the quotation sign is taken from the first
 * character of the token, which therefore has to be {@code '} or {@code "}.
 * When the caller supplies the quotation sign, the token may carry a leading
 * character in front of the opening quote, so no such check is made.
 *
 * @param token attribute value
 * @param quot quotation sign, or a blank to derive it from the token
 * @return next state
 * @throws ExpectStringException if the quotation sign has to be derived and the token
 *                               does not start with a single or double quote
 */
private HTMLState value(String token, char quot) {
    final boolean quoteGiven = quot != ' ';

    // expected string, got other
    if (!quoteGiven && !token.startsWith("'") && !token.startsWith("\""))
        throw new ExpectStringException(token, this.currentLine, this.currentState);

    this.currentValue = new StringBuilder();
    final char quote = quoteGiven ? quot : token.charAt(0);

    // split by quote character
    String[] split = token.split(String.valueOf(quote));
    for (int i = 0; i < split.length; i++)
        // handle escaped quote character
        if (split[i].endsWith("\\")) {
            this.currentValue.append(split[i]).append(quote);
            split[i] = "";
        }

    // delete first quotation character
    // NOTE(review): this removes the first buffered character, whatever it is — confirm it is always the quote
    if (!this.currentValue.isEmpty())
        this.currentValue.deleteCharAt(0);

    StringBuilder rebuilt = new StringBuilder();
    // TODO possible error source
    for (String s : split)
        if (!s.isEmpty())
            rebuilt.append(s);

    if (!rebuilt.isEmpty() && this.tagManager.hasText(stripTag(token)))
        return text(token);
    return HTMLState.ATTRIBUTE;
}
/**
 * Comment on code.
 * <p>
 * Tokens are buffered until one contains {@code "-->"}. That token ends the
 * comment: a builder for the pseudo tag {@code "--"} receives the buffered
 * text and is pushed, and the state saved when the comment was opened is
 * restored. Unless splitting the token at {@code "-->"} yields exactly one
 * piece, the token is additionally handed to {@code close_tag}, starting at
 * the {@code "-->"}.
 *
 * @param token comment
 * @return next state
 */
private HTMLState comment(String token) {
    if (this.currentText.isEmpty()) {
        this.currentText = new StringBuilder();
    }

    // append comment
    final boolean endsHere = token.contains("-->");
    if (!endsHere) {
        this.currentText.append(token);
        return HTMLState.COMMENT;
    }

    // end comment
    final ElementBuilder commentBuilder = new ElementBuilder(this.parser, "--");
    commentBuilder.setText(this.currentText.toString());
    // always reset
    this.currentText = new StringBuilder();
    this.elementBuilders.push(commentBuilder);

    final boolean singlePiece = token.split("-->").length == 1;
    final HTMLState resumed = commentResetInbetween();
    if (singlePiece) {
        return resumed;
    }

    this.currentState = resumed;
    final int endIdx = token.indexOf("-->");
    return close_tag(token.substring(endIdx));
}
/**
 * Define the doctype.
 * Stays in the DOCTYPE state until a token ending in {@code '>'} arrives.
 *
 * @param token document type
 * @return next state
 */
private HTMLState doctype(String token) {
    final boolean htmlDoctype = stripTag(token).equalsIgnoreCase("HTML");
    if (!htmlDoctype) {
        /*
        Not implemented. Might do so in the future, might not.
         */
    }

    return token.endsWith(">") ? HTMLState.TEXT : HTMLState.DOCTYPE;
}
// Helper methods
/**
 * Reset inbetween state.
 * Hands back the stored inbetween state and sets the field back to COMMENT.
 *
 * @return previous inbetween state
 */
private HTMLState commentResetInbetween() {
    try {
        return this.inbetweenState;
    } finally {
        // runs after the return value has been captured
        this.inbetweenState = HTMLState.COMMENT;
    }
}
/**
 * Never forget to set the inbetween state!
 * Stores the current state as inbetween state so it can be restored once the comment ends.
 *
 * @return HTMLState.COMMENT
 */
private HTMLState returnCommentState() {
    final HTMLState resumeAfterComment = this.currentState;
    this.inbetweenState = resumeAfterComment;
    return HTMLState.COMMENT;
}
/**
 * Continue down without returning own State.
 * Runs the token through {@code nextState} and reports the state that results from it.
 *
 * @param token next token
 * @return this.currentState
 */
private HTMLState nextTokenDontReturn(String token) {
    nextState(token);
    final HTMLState resulting = this.currentState;
    return resulting;
}
/**
 * Continue down without returning own State, and close the current tag.
 * The state returned by {@code close_tag} is deliberately ignored.
 *
 * @param token next token
 * @return this.currentState
 */
private HTMLState closeTagDontReturn(String token) {
    close_tag(token);
    final HTMLState unchanged = this.currentState;
    return unchanged;
}
}