// Author: maple
// Date: 9/24/25

package org.openautonomousconnection.htmlparser;
|
|
|
|
import dev.unlegitdqrk.unlegitlibrary.string.StringUtils;
|
|
import org.openautonomousconnection.htmlparser.html.body.misc.HTMLComment;
|
|
import lombok.Getter;
|
|
import lombok.Setter;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class DocumentBuilder {
|
|
@Getter @Setter
|
|
protected String content;
|
|
|
|
@Getter
|
|
protected List<HTMLComment> comments;
|
|
|
|
@Getter
|
|
protected List<String> attributes, texts, tags;
|
|
|
|
public DocumentBuilder(String content) {
|
|
this.content = content; //content.replace("\n", "");
|
|
this.comments = new ArrayList<>();
|
|
this.attributes = new ArrayList<>();
|
|
this.texts = new ArrayList<>();
|
|
}
|
|
|
|
/**
|
|
* Extracts all comments and strings into lists
|
|
*/
|
|
public void extract() {
|
|
this.extractComments();
|
|
this.extractStringsAndAttributes();
|
|
this.extractTexts();
|
|
|
|
}
|
|
|
|
/**
|
|
* inserts the extracts back into the content string
|
|
*/
|
|
public void insert() {
|
|
this.insertTexts();
|
|
this.insertStringsAndAttributes();
|
|
this.insertComments();
|
|
}
|
|
|
|
protected void extractComments() {
|
|
Pattern pattern = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
|
|
|
|
Matcher matcher = pattern.matcher(content);
|
|
|
|
|
|
int index = 0;
|
|
|
|
while (matcher.find()) {
|
|
this.content = this.content.replace("<!--" + matcher.group(1) + "-->", "<!--C" + index + "-->");
|
|
|
|
this.comments.add(new HTMLComment(matcher.group(1)));
|
|
|
|
index++;
|
|
}
|
|
}
|
|
|
|
protected void insertComments() {
|
|
if(this.comments.isEmpty())
|
|
return;
|
|
|
|
int i = 0;
|
|
for(; i < this.comments.size(); i++)
|
|
this.content = this.content.replace("<!--C" + i + "-->", this.comments.get(i).toString());
|
|
|
|
for(; i > 0; i--)
|
|
this.comments.removeFirst();
|
|
}
|
|
|
|
|
|
|
|
protected void extractStringsAndAttributes() {
|
|
Pattern pattern = Pattern.compile("\"(.*?)\"|'(.*?)'", Pattern.DOTALL);
|
|
|
|
Matcher matcher = pattern.matcher(this.content);
|
|
|
|
|
|
int index = 0;
|
|
|
|
while (matcher.find()) {
|
|
|
|
if(matcher.group(1) != null) {
|
|
this.content = this.content.replace("\"" + matcher.group(1) + "\"", "\"S" + index + "\"");
|
|
|
|
this.attributes.add(matcher.group(1));
|
|
}
|
|
|
|
else {
|
|
this.content = this.content.replace("'" + matcher.group(2) + "'", "'S" + index + "'");
|
|
|
|
this.attributes.add(matcher.group(2));
|
|
}
|
|
|
|
index++;
|
|
}
|
|
}
|
|
|
|
protected void insertStringsAndAttributes() {
|
|
if(this.attributes.isEmpty())
|
|
return;
|
|
int i = 0;
|
|
for(; i < this.attributes.size(); i++) {
|
|
this.content = this.content.replace("\"S" + i + "\"", "\"" + attributes.get(i) + "\"");
|
|
this.content = this.content.replace("'S" + i + "'", "'" + attributes.get(i) + "'");
|
|
}
|
|
|
|
for(; i > 0; i--)
|
|
this.attributes.removeFirst();
|
|
}
|
|
|
|
protected void extractTexts() {
|
|
Pattern pattern = Pattern.compile(">([^<]+)(?=<)", Pattern.DOTALL);
|
|
|
|
Matcher matcher = pattern.matcher(content);
|
|
|
|
|
|
int index = 0;
|
|
|
|
while (matcher.find()) {
|
|
if(StringUtils.isEmptyString(matcher.group(1)))
|
|
continue;
|
|
|
|
this.content = this.content.replace(">" + matcher.group(1) + "<", ">T" + index + "<");
|
|
|
|
this.texts.add(matcher.group(1));
|
|
|
|
index++;
|
|
}
|
|
}
|
|
|
|
protected void insertTexts() {
|
|
if(this.texts.isEmpty())
|
|
return;
|
|
|
|
int i = 0;
|
|
for(; i < this.texts.size(); i++)
|
|
this.content = this.content.replace(">T" + i + "<", ">" + this.texts.get(i) + "<");
|
|
|
|
for(; i > 0; i--)
|
|
this.texts.removeFirst();
|
|
}
|
|
|
|
protected void extractTags() {
|
|
|
|
}
|
|
}
|