Files
html-parser/src/main/java/org/openautonomousconnection/htmlparser/DocumentBuilder.java

158 lines
3.9 KiB
Java
Raw Normal View History

2025-12-13 16:12:41 +01:00
// Author: maple
// date: 9/24/25
package org.openautonomousconnection.htmlparser;
import dev.unlegitdqrk.unlegitlibrary.string.StringUtils;
import org.openautonomousconnection.htmlparser.html.body.misc.HTMLComment;
import lombok.Getter;
import lombok.Setter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Temporarily swaps the comments, quoted strings and text nodes of a markup document for
 * short numbered placeholders, and later swaps the original values back in.
 *
 * <p>{@link #extract()} rewrites {@link #content} so that
 * <ul>
 *   <li>every {@code <!-- ... -->} becomes {@code <!--Cn-->},</li>
 *   <li>every {@code "..."} / {@code '...'} becomes {@code "Sn"} / {@code 'Sn'},</li>
 *   <li>every non-empty run of characters between {@code >} and {@code <} becomes {@code Tn},</li>
 * </ul>
 * where {@code n} is the index of the removed value in {@link #comments}, {@link #attributes}
 * or {@link #texts} respectively. {@link #insert()} reverses the substitution and empties the
 * three lists.
 */
public class DocumentBuilder {

    /** One comment; group 1 is everything between the delimiters (may span lines). */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);

    /** One quoted string; group 1 is set for double quotes, group 2 for single quotes. */
    private static final Pattern QUOTED_PATTERN = Pattern.compile("\"(.*?)\"|'(.*?)'", Pattern.DOTALL);

    /** One text run; group 1 is the text, the following {@code <} is only looked at, not consumed. */
    private static final Pattern TEXT_PATTERN = Pattern.compile(">([^<]+)(?=<)", Pattern.DOTALL);

    /**
     * Canonical decimal index as produced by string concatenation of an {@code int}:
     * no leading zeros and at most nine digits, so {@link Integer#parseInt(String)} cannot overflow.
     */
    private static final String INDEX = "(?<index>0|[1-9]\\d{0,8})";

    /** {@code <!--Cn-->}; the whole placeholder is the token that gets replaced. */
    private static final Pattern COMMENT_PLACEHOLDER =
            Pattern.compile("(?<token><!--C" + INDEX + "-->)");

    /** {@code "Sn"} or {@code 'Sn'}; only {@code Sn} is replaced, the quotes stay in place. */
    private static final Pattern QUOTED_PLACEHOLDER =
            Pattern.compile("([\"'])(?<token>S" + INDEX + ")(?=\\1)");

    /** {@code >Tn<}; only {@code Tn} is replaced, the angle brackets stay in place. */
    private static final Pattern TEXT_PLACEHOLDER =
            Pattern.compile(">(?<token>T" + INDEX + ")(?=<)");

    /** The document text; holds placeholders between {@link #extract()} and {@link #insert()}. */
    @Getter @Setter
    protected String content;

    /** Comments removed from {@link #content}; element {@code n} belongs to {@code <!--Cn-->}. */
    @Getter
    protected List<HTMLComment> comments;

    /**
     * {@code attributes}: quoted strings removed from {@link #content} (element {@code n} belongs
     * to {@code "Sn"} / {@code 'Sn'}). {@code texts}: text runs removed from {@link #content}
     * (element {@code n} belongs to {@code Tn}). {@code tags}: not filled by any code in this class.
     */
    @Getter
    protected List<String> attributes, texts, tags;

    /**
     * Creates a builder for the given document text. Nothing is extracted until
     * {@link #extract()} is called.
     *
     * @param content the document text; {@code null} is tolerated and makes
     *                {@link #extract()} and {@link #insert()} do nothing
     */
    public DocumentBuilder(String content) {
        this.content = content;
        this.comments = new ArrayList<>();
        this.attributes = new ArrayList<>();
        this.texts = new ArrayList<>();
        // Previously left unassigned, which made getTags() return null.
        this.tags = new ArrayList<>();
    }

    /**
     * Extracts all comments, quoted strings and text runs into their lists, leaving numbered
     * placeholders in {@link #content}.
     *
     * <p>The order matters: comments go first so that quotes inside a comment are not treated as
     * strings, and strings go before texts so that a {@code >} or {@code <} inside a quoted value
     * is not treated as a tag boundary.
     */
    public void extract() {
        this.extractComments();
        this.extractStringsAndAttributes();
        this.extractTexts();
    }

    /**
     * Inserts the extracted values back into {@link #content} and empties the lists.
     *
     * <p>Runs in the reverse order of {@link #extract()}: a restored text run may still contain
     * string placeholders, and a restored string may still contain comment placeholders.
     */
    public void insert() {
        this.insertTexts();
        this.insertStringsAndAttributes();
        this.insertComments();
    }

    /**
     * Replaces each comment in {@link #content} with {@code <!--Cn-->} and appends it to
     * {@link #comments}. Each occurrence is rewritten at its own position and receives its own
     * index, so identical comments no longer share a placeholder.
     */
    protected void extractComments() {
        if (this.content == null) {
            return;
        }
        String source = this.content;
        Matcher matcher = COMMENT_PATTERN.matcher(source);
        StringBuilder rewritten = new StringBuilder(source.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            // The list size is the index the new element is about to get, which keeps
            // "placeholder n <-> element n" true even if the list was not empty beforehand.
            rewritten.append(source, copiedUpTo, matcher.start())
                    .append("<!--C").append(this.comments.size()).append("-->");
            this.comments.add(new HTMLComment(matcher.group(1)));
            copiedUpTo = matcher.end();
        }
        this.content = rewritten.append(source, copiedUpTo, source.length()).toString();
    }

    /**
     * Replaces every {@code <!--Cn-->} in {@link #content} with the string form of
     * {@code comments.get(n)} and then clears {@link #comments}. Does nothing when there are no
     * extracted comments or no content.
     */
    protected void insertComments() {
        if (this.content == null || this.comments.isEmpty()) {
            return;
        }
        this.content = restorePlaceholders(this.content, COMMENT_PLACEHOLDER, this.comments);
        this.comments.clear();
    }

    /**
     * Replaces each double- or single-quoted string in {@link #content} with {@code "Sn"} or
     * {@code 'Sn'} (keeping the original quote character) and appends the unquoted value to
     * {@link #attributes}.
     */
    protected void extractStringsAndAttributes() {
        if (this.content == null) {
            return;
        }
        String source = this.content;
        Matcher matcher = QUOTED_PATTERN.matcher(source);
        StringBuilder rewritten = new StringBuilder(source.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            boolean doubleQuoted = matcher.group(1) != null;
            char quote = doubleQuoted ? '"' : '\'';
            String value = doubleQuoted ? matcher.group(1) : matcher.group(2);
            rewritten.append(source, copiedUpTo, matcher.start())
                    .append(quote).append('S').append(this.attributes.size()).append(quote);
            this.attributes.add(value);
            copiedUpTo = matcher.end();
        }
        this.content = rewritten.append(source, copiedUpTo, source.length()).toString();
    }

    /**
     * Replaces every {@code "Sn"} / {@code 'Sn'} in {@link #content} with the quoted
     * {@code attributes.get(n)} and then clears {@link #attributes}. Does nothing when there are
     * no extracted strings or no content.
     */
    protected void insertStringsAndAttributes() {
        if (this.content == null || this.attributes.isEmpty()) {
            return;
        }
        this.content = restorePlaceholders(this.content, QUOTED_PLACEHOLDER, this.attributes);
        this.attributes.clear();
    }

    /**
     * Replaces each text run between {@code >} and {@code <} in {@link #content} with
     * {@code Tn} and appends the text to {@link #texts}. Runs for which
     * {@code StringUtils.isEmptyString} returns {@code true} are left untouched and consume no
     * index (presumably blank/whitespace-only runs; confirm against the library).
     */
    protected void extractTexts() {
        if (this.content == null) {
            return;
        }
        String source = this.content;
        Matcher matcher = TEXT_PATTERN.matcher(source);
        StringBuilder rewritten = new StringBuilder(source.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            String text = matcher.group(1);
            if (StringUtils.isEmptyString(text)) {
                // Not copied here; it is carried over verbatim with the next gap.
                continue;
            }
            // The match covers '>' plus the text; the trailing '<' is outside the match
            // and is copied with the next gap.
            rewritten.append(source, copiedUpTo, matcher.start())
                    .append(">T").append(this.texts.size());
            this.texts.add(text);
            copiedUpTo = matcher.end();
        }
        this.content = rewritten.append(source, copiedUpTo, source.length()).toString();
    }

    /**
     * Replaces every {@code >Tn<} in {@link #content} with {@code >texts.get(n)<} and then
     * clears {@link #texts}. Does nothing when there are no extracted texts or no content.
     */
    protected void insertTexts() {
        if (this.content == null || this.texts.isEmpty()) {
            return;
        }
        this.content = restorePlaceholders(this.content, TEXT_PLACEHOLDER, this.texts);
        this.texts.clear();
    }

    /**
     * Not implemented: the body is empty and {@link #extract()} does not call it.
     */
    protected void extractTags() {
    }

    /**
     * Replaces placeholders in a single left-to-right pass. Text that has just been restored is
     * never scanned again, so a restored value that happens to look like another placeholder is
     * left alone (repeated whole-string replacement would rewrite it a second time).
     *
     * @param text        the text containing placeholders
     * @param placeholder pattern with a named group {@code index} (decimal list index) and a
     *                    named group {@code token} (the exact span to replace)
     * @param values      replacement values; the string form of element {@code index} is inserted
     * @return the text with every placeholder whose index exists in {@code values} replaced;
     *         placeholders with an unknown index are kept as they are
     */
    private static String restorePlaceholders(String text, Pattern placeholder, List<?> values) {
        Matcher matcher = placeholder.matcher(text);
        StringBuilder restored = new StringBuilder(text.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            int index = Integer.parseInt(matcher.group("index"));
            if (index >= values.size()) {
                continue;
            }
            restored.append(text, copiedUpTo, matcher.start("token"))
                    .append(String.valueOf(values.get(index)));
            copiedUpTo = matcher.end("token");
        }
        return restored.append(text, copiedUpTo, text.length()).toString();
    }
}