first commit
This commit was merged in pull request #1.
This commit is contained in:
@@ -0,0 +1,447 @@
|
||||
// Author: maple
|
||||
// date: 9/24/25
|
||||
|
||||
package org.openautonomousconnection.htmlparser.interpreter;
|
||||
|
||||
import org.openautonomousconnection.StringUtils_Remove_Please;
|
||||
import org.openautonomousconnection.htmlparser.Parser;
|
||||
import org.openautonomousconnection.htmlparser.TagManager;
|
||||
import org.openautonomousconnection.htmlparser.html.HTML;
|
||||
import org.openautonomousconnection.htmlparser.html.HTMLElement;
|
||||
import org.openautonomousconnection.htmlparser.interpreter.html.exception.ExpectStringException;
|
||||
import org.openautonomousconnection.htmlparser.interpreter.html.exception.UnexpectedTokenException;
|
||||
import org.openautonomousconnection.htmlparser.interpreter.html.state.HTMLState;
|
||||
import lombok.Getter;
|
||||
import org.openautonomousconnection.htmlparser.interpreter.script.ScriptInterpreter;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Stack;
|
||||
|
||||
public class HTMLInterpreter implements Interpreter {
|
||||
@Getter
|
||||
private HTMLState currentState = HTMLState.TAG;
|
||||
|
||||
// Used to go up a layer after comment is opened
|
||||
private HTMLState inbetweenState = HTMLState.COMMENT;
|
||||
|
||||
@Getter
|
||||
private Parser parser;
|
||||
private TagManager tagManager;
|
||||
private Stack<ElementBuilder> elementBuilders;
|
||||
private StringBuilder currentAttribute, currentValue, currentText, currentClosingTag;
|
||||
|
||||
public int currentLine = 1;
|
||||
|
||||
private HTMLElement currentElement;
|
||||
|
||||
private ScriptInterpreter scriptInterpreter;
|
||||
|
||||
/**
 * Creates an interpreter with empty buffers and an empty element stack.
 *
 * @param parser            parser this interpreter works for; its tag manager is fetched here
 * @param scriptInterpreter interpreter that tokens are delegated to while in SCRIPT state
 */
public HTMLInterpreter(Parser parser, ScriptInterpreter scriptInterpreter) {
    // collaborators
    this.scriptInterpreter = scriptInterpreter;
    this.parser = parser;
    this.tagManager = parser.getTagManager();

    // stack of currently open elements
    this.elementBuilders = new Stack<>();

    // scratch buffers
    this.currentAttribute = new StringBuilder();
    this.currentValue = new StringBuilder();
    this.currentClosingTag = new StringBuilder();
    this.currentText = new StringBuilder();
}
|
||||
|
||||
@Override
|
||||
public void nextState(String token) {
|
||||
boolean newLine = token.endsWith("\n");
|
||||
|
||||
if(token.isBlank()) {
|
||||
if (newLine)
|
||||
this.currentLine++;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
this.currentState = switch (this.currentState) {
|
||||
case TAG -> tag(token.strip());
|
||||
case CLOSE_TAG -> close_tag(token.strip());
|
||||
case TEXT -> text(token);
|
||||
case DOCTYPE -> doctype(token.strip());
|
||||
case ATTRIBUTE -> attribute(token.strip());
|
||||
case ATTRIBUTE_EQUALS -> attribute_equals(token.strip());
|
||||
case COMMENT -> comment(token);
|
||||
case VALUE -> value(token);
|
||||
case SCRIPT -> script(token);
|
||||
default -> this.currentState;
|
||||
};
|
||||
|
||||
if(newLine)
|
||||
this.currentLine++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean finished() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public HTML getResult() {
|
||||
return (HTML) this.currentElement;
|
||||
}
|
||||
|
||||
// Only public at the moment because of JavaScriptInterpreter
|
||||
public static String stripTag(String token) {
|
||||
return token.replace("<","").replace(">","");
|
||||
}
|
||||
|
||||
/**
|
||||
* Open a script
|
||||
* @param token script
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState script(String token) {
|
||||
this.scriptInterpreter.currentLine = this.currentLine;
|
||||
|
||||
this.scriptInterpreter.nextState(token);
|
||||
|
||||
// TODO: Change for release. This is debug code
|
||||
if(this.scriptInterpreter.finished()) {
|
||||
// the ScriptInterpreter already has its own ElementBuilder
|
||||
this.elementBuilders.pop();
|
||||
|
||||
this.elementBuilders.push(this.scriptInterpreter.getElementBuilder());
|
||||
|
||||
return this.close_tag(token);
|
||||
}
|
||||
else
|
||||
return HTMLState.SCRIPT;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Open a tag
|
||||
* @param token tag
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState tag(String token) {
|
||||
String tagName = stripTag(token);
|
||||
|
||||
boolean hasText = this.tagManager.hasText(tagName);
|
||||
|
||||
if(tagName.equalsIgnoreCase("!DOCTYPE"))
|
||||
return HTMLState.DOCTYPE;
|
||||
|
||||
else if(tagName.stripLeading().startsWith("!--"))
|
||||
return returnCommentState();
|
||||
|
||||
this.elementBuilders.push(new ElementBuilder(this.parser, tagName));
|
||||
|
||||
|
||||
String[] split = new String[] {token};
|
||||
|
||||
if(token.contains(">"))
|
||||
split = StringUtils_Remove_Please.splitSeq(new String[]{
|
||||
token.substring(0, token.indexOf('>'))
|
||||
}, ">");
|
||||
|
||||
|
||||
// TODO: Change for release. This is debug code
|
||||
if(this.elementBuilders.peek().getTagName().equals("script"))
|
||||
return split.length == 1 ? HTMLState.SCRIPT : script(token.substring(token.indexOf(">")+1));
|
||||
|
||||
|
||||
if(!token.contains(">"))
|
||||
return HTMLState.ATTRIBUTE;
|
||||
|
||||
if(split.length == 1)
|
||||
return hasText ? HTMLState.TEXT : HTMLState.TAG;
|
||||
else
|
||||
return attribute(token.substring(token.indexOf('>')+1));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Close a tag
|
||||
* @param token closing tag
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState close_tag(String token) {
|
||||
System.out.println(Arrays.toString(this.elementBuilders.toArray()));
|
||||
this.currentClosingTag.append(token.toLowerCase().strip());
|
||||
|
||||
String ct = this.currentClosingTag.toString();
|
||||
|
||||
String tagName = this.elementBuilders.peek().getTagName();
|
||||
|
||||
// one instruction tags don't have a clo
|
||||
if(!this.tagManager.hasText(tagName)) {
|
||||
this.elementBuilders.pop();
|
||||
|
||||
return HTMLState.TEXT;
|
||||
}
|
||||
|
||||
// Comments are special
|
||||
String should = tagName.equals("--") ? tagName + '>' : "</" + tagName + ">";
|
||||
|
||||
System.out.println("should: " + should + " token: " + token);
|
||||
|
||||
if(should.equals(ct)) {
|
||||
|
||||
if(this.currentElement != null)
|
||||
this.currentElement = this.currentElement.append(this.elementBuilders.pop().build());
|
||||
else
|
||||
this.currentElement = this.elementBuilders.pop().build();
|
||||
|
||||
if(!(this.currentElement instanceof HTML))
|
||||
this.currentElement = this.currentElement.getParent();
|
||||
|
||||
this.currentClosingTag = new StringBuilder();
|
||||
return HTMLState.TEXT;
|
||||
|
||||
}
|
||||
|
||||
// </should> not reached yet
|
||||
else if(should.startsWith(ct))
|
||||
return HTMLState.TEXT;
|
||||
|
||||
// token not the same as </should>
|
||||
else
|
||||
throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
|
||||
|
||||
}
|
||||
|
||||
private HTMLState text(String token) {
|
||||
String strip = token.stripLeading();
|
||||
|
||||
// handle string begin
|
||||
if(this.currentText.isEmpty()) {
|
||||
if(strip.startsWith("<"))
|
||||
return tag(strip);
|
||||
|
||||
this.currentText.append(token);
|
||||
|
||||
return HTMLState.TEXT;
|
||||
}
|
||||
|
||||
// handle string end or nested elements
|
||||
else if(token.startsWith("<")) {
|
||||
this.elementBuilders.peek().setText(this.currentText.toString());
|
||||
|
||||
// always reset
|
||||
this.currentText = new StringBuilder();
|
||||
|
||||
if(token.startsWith("</"))
|
||||
return close_tag(token.stripTrailing());
|
||||
else
|
||||
return tag(token.stripTrailing());
|
||||
}
|
||||
|
||||
// continue as text
|
||||
|
||||
else {
|
||||
this.currentText.append(token);
|
||||
return HTMLState.TEXT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Declare an attribute
|
||||
* @param token attribute type
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState attribute(String token) {
|
||||
if(token.startsWith(">") || token.endsWith(">"))
|
||||
return text(token);
|
||||
|
||||
else if(token.contains("=")) {
|
||||
// Recursition if declaration and equals are same token
|
||||
this.currentAttribute = new StringBuilder(token.substring(0, token.indexOf('=')));
|
||||
|
||||
return attribute_equals(token.substring(token.indexOf('=')));
|
||||
}
|
||||
else {
|
||||
this.currentAttribute = new StringBuilder(token);
|
||||
return HTMLState.ATTRIBUTE_EQUALS;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle equals operator between attribute declaration and definition (can only be '='; will throw otherwise)
|
||||
* @param token equals operator
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState attribute_equals(String token) {
|
||||
boolean dq = token.contains("\""), sq = token.contains("'");
|
||||
if(dq || sq) {
|
||||
char quot = dq ? '"' : '\'';
|
||||
// Recursion if declaration and equals are same token
|
||||
|
||||
return value(token.substring(token.indexOf(quot)-1), quot);
|
||||
}
|
||||
else if(token.equals("=")){
|
||||
return HTMLState.VALUE;
|
||||
}
|
||||
else {
|
||||
throw new UnexpectedTokenException(token, this.currentLine, this.currentState);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Define an attribute
|
||||
* @param token attribute value
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState value(String token) {
|
||||
return value(token, ' ');
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a string attribute
|
||||
* @param token attribute value
|
||||
* @param quot quotation sign
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState value(String token, char quot) {
|
||||
|
||||
// expected string, got other
|
||||
if(!token.startsWith("'") && token.startsWith("\""))
|
||||
throw new ExpectStringException(token, this.currentLine, this.currentState);
|
||||
|
||||
this.currentValue = new StringBuilder();
|
||||
|
||||
quot = quot != ' ' ? quot : token.charAt(0);
|
||||
|
||||
// split by quote character
|
||||
String[] split = token.split(String.valueOf(quot));
|
||||
|
||||
for(int i = 0; i < split.length; i++)
|
||||
|
||||
// handle escaped quote character
|
||||
if(split[i].endsWith("\\")) {
|
||||
this.currentValue.append(split[i]).append(quot);
|
||||
split[i] = "";
|
||||
}
|
||||
|
||||
// delete first quotation character
|
||||
if(!this.currentValue.isEmpty())
|
||||
this.currentValue.deleteCharAt(0);
|
||||
|
||||
StringBuilder rebuilt = new StringBuilder();
|
||||
|
||||
// TODO possible error source
|
||||
|
||||
for(String s : split)
|
||||
|
||||
if(!s.isEmpty())
|
||||
rebuilt.append(s);
|
||||
|
||||
if(!rebuilt.isEmpty() && this.tagManager.hasText(stripTag(token)))
|
||||
return text(token);
|
||||
|
||||
return HTMLState.ATTRIBUTE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Comment on code
|
||||
* @param token comment
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState comment(String token) {
|
||||
if(this.currentText.isEmpty())
|
||||
this.currentText = new StringBuilder();
|
||||
|
||||
// append comment
|
||||
if(!token.contains("-->")) {
|
||||
this.currentText.append(token);
|
||||
|
||||
return HTMLState.COMMENT;
|
||||
}
|
||||
|
||||
// end comment
|
||||
|
||||
ElementBuilder elementBuilder = new ElementBuilder(this.parser, "--");
|
||||
elementBuilder.setText(this.currentText.toString());
|
||||
|
||||
// always reset
|
||||
this.currentText = new StringBuilder();
|
||||
|
||||
this.elementBuilders.push(elementBuilder);
|
||||
|
||||
if(token.split("-->").length == 1)
|
||||
return commentResetInbetween();
|
||||
|
||||
|
||||
this.currentState = commentResetInbetween();
|
||||
|
||||
return close_tag(
|
||||
token.substring(token.indexOf("-->"))
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Define the doctype
|
||||
* @param token document type
|
||||
* @return next state
|
||||
*/
|
||||
private HTMLState doctype(String token) {
|
||||
String tag = stripTag(token);
|
||||
if(!tag.equalsIgnoreCase("HTML")) {
|
||||
/*
|
||||
Not implemented. Might do so in the future, might not.
|
||||
*/
|
||||
}
|
||||
|
||||
if(token.endsWith(">"))
|
||||
return HTMLState.TEXT;
|
||||
else
|
||||
return HTMLState.DOCTYPE;
|
||||
}
|
||||
|
||||
// Helper methods
|
||||
|
||||
/**
|
||||
* Reset inbetween state
|
||||
* @return previous inbetween state
|
||||
*/
|
||||
private HTMLState commentResetInbetween() {
|
||||
HTMLState temp = this.inbetweenState;
|
||||
|
||||
this.inbetweenState = HTMLState.COMMENT;
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Never forget to set the inbetween state!
|
||||
* @return HTMLState.COMMENT
|
||||
*/
|
||||
private HTMLState returnCommentState() {
|
||||
this.inbetweenState = this.currentState;
|
||||
return HTMLState.COMMENT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Continue down without returning own State
|
||||
* @param token next token
|
||||
* @return this.currentState
|
||||
*/
|
||||
private HTMLState nextTokenDontReturn(String token) {
|
||||
this.nextState(token);
|
||||
|
||||
return this.currentState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Continue down without returning own State, and close the current tag
|
||||
* @param token next token
|
||||
* @return this.currentState
|
||||
*/
|
||||
private HTMLState closeTagDontReturn(String token) {
|
||||
this.close_tag(token);
|
||||
|
||||
return this.currentState;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user