448 lines
12 KiB
Java
448 lines
12 KiB
Java
|
|
// Author: maple
|
||
|
|
// date: 9/24/25
|
||
|
|
|
||
|
|
package org.openautonomousconnection.htmlparser.interpreter;
|
||
|
|
|
||
|
|
import org.openautonomousconnection.StringUtils_Remove_Please;
|
||
|
|
import org.openautonomousconnection.htmlparser.Parser;
|
||
|
|
import org.openautonomousconnection.htmlparser.TagManager;
|
||
|
|
import org.openautonomousconnection.htmlparser.html.HTML;
|
||
|
|
import org.openautonomousconnection.htmlparser.html.HTMLElement;
|
||
|
|
import org.openautonomousconnection.htmlparser.interpreter.html.exception.ExpectStringException;
|
||
|
|
import org.openautonomousconnection.htmlparser.interpreter.html.exception.UnexpectedTokenException;
|
||
|
|
import org.openautonomousconnection.htmlparser.interpreter.html.state.HTMLState;
|
||
|
|
import lombok.Getter;
|
||
|
|
import org.openautonomousconnection.htmlparser.interpreter.script.ScriptInterpreter;
|
||
|
|
|
||
|
|
import java.util.Arrays;
|
||
|
|
import java.util.Map;
|
||
|
|
import java.util.Stack;
|
||
|
|
|
||
|
|
public class HTMLInterpreter implements Interpreter {
|
||
|
|
@Getter
|
||
|
|
private HTMLState currentState = HTMLState.TAG;
|
||
|
|
|
||
|
|
// Used to go up a layer after comment is opened
|
||
|
|
private HTMLState inbetweenState = HTMLState.COMMENT;
|
||
|
|
|
||
|
|
@Getter
|
||
|
|
private Parser parser;
|
||
|
|
private TagManager tagManager;
|
||
|
|
private Stack<ElementBuilder> elementBuilders;
|
||
|
|
private StringBuilder currentAttribute, currentValue, currentText, currentClosingTag;
|
||
|
|
|
||
|
|
public int currentLine = 1;
|
||
|
|
|
||
|
|
private HTMLElement currentElement;
|
||
|
|
|
||
|
|
private ScriptInterpreter scriptInterpreter;
|
||
|
|
|
||
|
|
/**
 * Create an interpreter bound to a parser and a script interpreter.
 * The tag manager is taken from the parser; all buffers and the stack of
 * open elements start out empty.
 *
 * @param parser parser that owns this interpreter
 * @param scriptInterpreter interpreter that script content is delegated to
 */
public HTMLInterpreter(Parser parser, ScriptInterpreter scriptInterpreter) {
    // collaborators
    this.scriptInterpreter = scriptInterpreter;
    this.parser = parser;
    this.tagManager = parser.getTagManager();

    // open elements
    this.elementBuilders = new Stack<>();

    // token buffers
    this.currentAttribute = new StringBuilder();
    this.currentValue = new StringBuilder();
    this.currentText = new StringBuilder();
    this.currentClosingTag = new StringBuilder();
}
|
||
|
|
|
||
|
|
@Override
|
||
|
|
public void nextState(String token) {
|
||
|
|
boolean newLine = token.endsWith("\n");
|
||
|
|
|
||
|
|
if(token.isBlank()) {
|
||
|
|
if (newLine)
|
||
|
|
this.currentLine++;
|
||
|
|
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
this.currentState = switch (this.currentState) {
|
||
|
|
case TAG -> tag(token.strip());
|
||
|
|
case CLOSE_TAG -> close_tag(token.strip());
|
||
|
|
case TEXT -> text(token);
|
||
|
|
case DOCTYPE -> doctype(token.strip());
|
||
|
|
case ATTRIBUTE -> attribute(token.strip());
|
||
|
|
case ATTRIBUTE_EQUALS -> attribute_equals(token.strip());
|
||
|
|
case COMMENT -> comment(token);
|
||
|
|
case VALUE -> value(token);
|
||
|
|
case SCRIPT -> script(token);
|
||
|
|
default -> this.currentState;
|
||
|
|
};
|
||
|
|
|
||
|
|
if(newLine)
|
||
|
|
this.currentLine++;
|
||
|
|
}
|
||
|
|
|
||
|
|
@Override
|
||
|
|
public boolean finished() {
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
public HTML getResult() {
|
||
|
|
return (HTML) this.currentElement;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Only public at the moment because of JavaScriptInterpreter
|
||
|
|
public static String stripTag(String token) {
|
||
|
|
return token.replace("<","").replace(">","");
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Open a script
|
||
|
|
* @param token script
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState script(String token) {
|
||
|
|
this.scriptInterpreter.currentLine = this.currentLine;
|
||
|
|
|
||
|
|
this.scriptInterpreter.nextState(token);
|
||
|
|
|
||
|
|
// TODO: Change for release. This is debug code
|
||
|
|
if(this.scriptInterpreter.finished()) {
|
||
|
|
// the ScriptInterpreter already has its own ElementBuilder
|
||
|
|
this.elementBuilders.pop();
|
||
|
|
|
||
|
|
this.elementBuilders.push(this.scriptInterpreter.getElementBuilder());
|
||
|
|
|
||
|
|
return this.close_tag(token);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
return HTMLState.SCRIPT;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Open a tag
|
||
|
|
* @param token tag
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState tag(String token) {
|
||
|
|
String tagName = stripTag(token);
|
||
|
|
|
||
|
|
boolean hasText = this.tagManager.hasText(tagName);
|
||
|
|
|
||
|
|
if(tagName.equalsIgnoreCase("!DOCTYPE"))
|
||
|
|
return HTMLState.DOCTYPE;
|
||
|
|
|
||
|
|
else if(tagName.stripLeading().startsWith("!--"))
|
||
|
|
return returnCommentState();
|
||
|
|
|
||
|
|
this.elementBuilders.push(new ElementBuilder(this.parser, tagName));
|
||
|
|
|
||
|
|
|
||
|
|
String[] split = new String[] {token};
|
||
|
|
|
||
|
|
if(token.contains(">"))
|
||
|
|
split = StringUtils_Remove_Please.splitSeq(new String[]{
|
||
|
|
token.substring(0, token.indexOf('>'))
|
||
|
|
}, ">");
|
||
|
|
|
||
|
|
|
||
|
|
// TODO: Change for release. This is debug code
|
||
|
|
if(this.elementBuilders.peek().getTagName().equals("script"))
|
||
|
|
return split.length == 1 ? HTMLState.SCRIPT : script(token.substring(token.indexOf(">")+1));
|
||
|
|
|
||
|
|
|
||
|
|
if(!token.contains(">"))
|
||
|
|
return HTMLState.ATTRIBUTE;
|
||
|
|
|
||
|
|
if(split.length == 1)
|
||
|
|
return hasText ? HTMLState.TEXT : HTMLState.TAG;
|
||
|
|
else
|
||
|
|
return attribute(token.substring(token.indexOf('>')+1));
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Close a tag
|
||
|
|
* @param token closing tag
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState close_tag(String token) {
|
||
|
|
System.out.println(Arrays.toString(this.elementBuilders.toArray()));
|
||
|
|
this.currentClosingTag.append(token.toLowerCase().strip());
|
||
|
|
|
||
|
|
String ct = this.currentClosingTag.toString();
|
||
|
|
|
||
|
|
String tagName = this.elementBuilders.peek().getTagName();
|
||
|
|
|
||
|
|
// one instruction tags don't have a clo
|
||
|
|
if(!this.tagManager.hasText(tagName)) {
|
||
|
|
this.elementBuilders.pop();
|
||
|
|
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Comments are special
|
||
|
|
String should = tagName.equals("--") ? tagName + '>' : "</" + tagName + ">";
|
||
|
|
|
||
|
|
System.out.println("should: " + should + " token: " + token);
|
||
|
|
|
||
|
|
if(should.equals(ct)) {
|
||
|
|
|
||
|
|
if(this.currentElement != null)
|
||
|
|
this.currentElement = this.currentElement.append(this.elementBuilders.pop().build());
|
||
|
|
else
|
||
|
|
this.currentElement = this.elementBuilders.pop().build();
|
||
|
|
|
||
|
|
if(!(this.currentElement instanceof HTML))
|
||
|
|
this.currentElement = this.currentElement.getParent();
|
||
|
|
|
||
|
|
this.currentClosingTag = new StringBuilder();
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
// </should> not reached yet
|
||
|
|
else if(should.startsWith(ct))
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
|
||
|
|
// token not the same as </should>
|
||
|
|
else
|
||
|
|
throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
private HTMLState text(String token) {
|
||
|
|
String strip = token.stripLeading();
|
||
|
|
|
||
|
|
// handle string begin
|
||
|
|
if(this.currentText.isEmpty()) {
|
||
|
|
if(strip.startsWith("<"))
|
||
|
|
return tag(strip);
|
||
|
|
|
||
|
|
this.currentText.append(token);
|
||
|
|
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
}
|
||
|
|
|
||
|
|
// handle string end or nested elements
|
||
|
|
else if(token.startsWith("<")) {
|
||
|
|
this.elementBuilders.peek().setText(this.currentText.toString());
|
||
|
|
|
||
|
|
// always reset
|
||
|
|
this.currentText = new StringBuilder();
|
||
|
|
|
||
|
|
if(token.startsWith("</"))
|
||
|
|
return close_tag(token.stripTrailing());
|
||
|
|
else
|
||
|
|
return tag(token.stripTrailing());
|
||
|
|
}
|
||
|
|
|
||
|
|
// continue as text
|
||
|
|
|
||
|
|
else {
|
||
|
|
this.currentText.append(token);
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Declare an attribute
|
||
|
|
* @param token attribute type
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState attribute(String token) {
|
||
|
|
if(token.startsWith(">") || token.endsWith(">"))
|
||
|
|
return text(token);
|
||
|
|
|
||
|
|
else if(token.contains("=")) {
|
||
|
|
// Recursition if declaration and equals are same token
|
||
|
|
this.currentAttribute = new StringBuilder(token.substring(0, token.indexOf('=')));
|
||
|
|
|
||
|
|
return attribute_equals(token.substring(token.indexOf('=')));
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
this.currentAttribute = new StringBuilder(token);
|
||
|
|
return HTMLState.ATTRIBUTE_EQUALS;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Handle equals operator between attribute declaration and definition (can only be '='; will throw otherwise)
|
||
|
|
* @param token equals operator
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState attribute_equals(String token) {
|
||
|
|
boolean dq = token.contains("\""), sq = token.contains("'");
|
||
|
|
if(dq || sq) {
|
||
|
|
char quot = dq ? '"' : '\'';
|
||
|
|
// Recursion if declaration and equals are same token
|
||
|
|
|
||
|
|
return value(token.substring(token.indexOf(quot)-1), quot);
|
||
|
|
}
|
||
|
|
else if(token.equals("=")){
|
||
|
|
return HTMLState.VALUE;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Define an attribute
|
||
|
|
* @param token attribute value
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState value(String token) {
|
||
|
|
return value(token, ' ');
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Define a string attribute
|
||
|
|
* @param token attribute value
|
||
|
|
* @param quot quotation sign
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState value(String token, char quot) {
|
||
|
|
|
||
|
|
// expected string, got other
|
||
|
|
if(!token.startsWith("'") && token.startsWith("\""))
|
||
|
|
throw new ExpectStringException(token, this.currentLine, this.currentState);
|
||
|
|
|
||
|
|
this.currentValue = new StringBuilder();
|
||
|
|
|
||
|
|
quot = quot != ' ' ? quot : token.charAt(0);
|
||
|
|
|
||
|
|
// split by quote character
|
||
|
|
String[] split = token.split(String.valueOf(quot));
|
||
|
|
|
||
|
|
for(int i = 0; i < split.length; i++)
|
||
|
|
|
||
|
|
// handle escaped quote character
|
||
|
|
if(split[i].endsWith("\\")) {
|
||
|
|
this.currentValue.append(split[i]).append(quot);
|
||
|
|
split[i] = "";
|
||
|
|
}
|
||
|
|
|
||
|
|
// delete first quotation character
|
||
|
|
if(!this.currentValue.isEmpty())
|
||
|
|
this.currentValue.deleteCharAt(0);
|
||
|
|
|
||
|
|
StringBuilder rebuilt = new StringBuilder();
|
||
|
|
|
||
|
|
// TODO possible error source
|
||
|
|
|
||
|
|
for(String s : split)
|
||
|
|
|
||
|
|
if(!s.isEmpty())
|
||
|
|
rebuilt.append(s);
|
||
|
|
|
||
|
|
if(!rebuilt.isEmpty() && this.tagManager.hasText(stripTag(token)))
|
||
|
|
return text(token);
|
||
|
|
|
||
|
|
return HTMLState.ATTRIBUTE;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Comment on code
|
||
|
|
* @param token comment
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState comment(String token) {
|
||
|
|
if(this.currentText.isEmpty())
|
||
|
|
this.currentText = new StringBuilder();
|
||
|
|
|
||
|
|
// append comment
|
||
|
|
if(!token.contains("-->")) {
|
||
|
|
this.currentText.append(token);
|
||
|
|
|
||
|
|
return HTMLState.COMMENT;
|
||
|
|
}
|
||
|
|
|
||
|
|
// end comment
|
||
|
|
|
||
|
|
ElementBuilder elementBuilder = new ElementBuilder(this.parser, "--");
|
||
|
|
elementBuilder.setText(this.currentText.toString());
|
||
|
|
|
||
|
|
// always reset
|
||
|
|
this.currentText = new StringBuilder();
|
||
|
|
|
||
|
|
this.elementBuilders.push(elementBuilder);
|
||
|
|
|
||
|
|
if(token.split("-->").length == 1)
|
||
|
|
return commentResetInbetween();
|
||
|
|
|
||
|
|
|
||
|
|
this.currentState = commentResetInbetween();
|
||
|
|
|
||
|
|
return close_tag(
|
||
|
|
token.substring(token.indexOf("-->"))
|
||
|
|
);
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Define the doctype
|
||
|
|
* @param token document type
|
||
|
|
* @return next state
|
||
|
|
*/
|
||
|
|
private HTMLState doctype(String token) {
|
||
|
|
String tag = stripTag(token);
|
||
|
|
if(!tag.equalsIgnoreCase("HTML")) {
|
||
|
|
/*
|
||
|
|
Not implemented. Might do so in the future, might not.
|
||
|
|
*/
|
||
|
|
}
|
||
|
|
|
||
|
|
if(token.endsWith(">"))
|
||
|
|
return HTMLState.TEXT;
|
||
|
|
else
|
||
|
|
return HTMLState.DOCTYPE;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Helper methods
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Reset inbetween state
|
||
|
|
* @return previous inbetween state
|
||
|
|
*/
|
||
|
|
private HTMLState commentResetInbetween() {
|
||
|
|
HTMLState temp = this.inbetweenState;
|
||
|
|
|
||
|
|
this.inbetweenState = HTMLState.COMMENT;
|
||
|
|
|
||
|
|
return temp;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Never forget to set the inbetween state!
|
||
|
|
* @return HTMLState.COMMENT
|
||
|
|
*/
|
||
|
|
private HTMLState returnCommentState() {
|
||
|
|
this.inbetweenState = this.currentState;
|
||
|
|
return HTMLState.COMMENT;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Continue down without returning own State
|
||
|
|
* @param token next token
|
||
|
|
* @return this.currentState
|
||
|
|
*/
|
||
|
|
private HTMLState nextTokenDontReturn(String token) {
|
||
|
|
this.nextState(token);
|
||
|
|
|
||
|
|
return this.currentState;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Continue down without returning own State, and close the current tag
|
||
|
|
* @param token next token
|
||
|
|
* @return this.currentState
|
||
|
|
*/
|
||
|
|
private HTMLState closeTagDontReturn(String token) {
|
||
|
|
this.close_tag(token);
|
||
|
|
|
||
|
|
return this.currentState;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
}
|