first commit
This commit is contained in:
@@ -0,0 +1,157 @@
|
||||
// Author: maple
|
||||
// date: 9/24/25
|
||||
|
||||
package org.openautonomousconnection.htmlparser;
|
||||
|
||||
import dev.unlegitdqrk.unlegitlibrary.string.StringUtils;
|
||||
import org.openautonomousconnection.htmlparser.html.body.misc.HTMLComment;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DocumentBuilder {
|
||||
@Getter @Setter
|
||||
protected String content;
|
||||
|
||||
@Getter
|
||||
protected List<HTMLComment> comments;
|
||||
|
||||
@Getter
|
||||
protected List<String> attributes, texts, tags;
|
||||
|
||||
public DocumentBuilder(String content) {
|
||||
this.content = content; //content.replace("\n", "");
|
||||
this.comments = new ArrayList<>();
|
||||
this.attributes = new ArrayList<>();
|
||||
this.texts = new ArrayList<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts all comments and strings into lists
|
||||
*/
|
||||
public void extract() {
|
||||
this.extractComments();
|
||||
this.extractStringsAndAttributes();
|
||||
this.extractTexts();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* inserts the extracts back into the content string
|
||||
*/
|
||||
public void insert() {
|
||||
this.insertTexts();
|
||||
this.insertStringsAndAttributes();
|
||||
this.insertComments();
|
||||
}
|
||||
|
||||
protected void extractComments() {
|
||||
Pattern pattern = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
|
||||
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
|
||||
|
||||
int index = 0;
|
||||
|
||||
while (matcher.find()) {
|
||||
this.content = this.content.replace("<!--" + matcher.group(1) + "-->", "<!--C" + index + "-->");
|
||||
|
||||
this.comments.add(new HTMLComment(matcher.group(1)));
|
||||
|
||||
index++;
|
||||
}
|
||||
}
|
||||
|
||||
protected void insertComments() {
|
||||
if(this.comments.isEmpty())
|
||||
return;
|
||||
|
||||
int i = 0;
|
||||
for(; i < this.comments.size(); i++)
|
||||
this.content = this.content.replace("<!--C" + i + "-->", this.comments.get(i).toString());
|
||||
|
||||
for(; i > 0; i--)
|
||||
this.comments.removeFirst();
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected void extractStringsAndAttributes() {
|
||||
Pattern pattern = Pattern.compile("\"(.*?)\"|'(.*?)'", Pattern.DOTALL);
|
||||
|
||||
Matcher matcher = pattern.matcher(this.content);
|
||||
|
||||
|
||||
int index = 0;
|
||||
|
||||
while (matcher.find()) {
|
||||
|
||||
if(matcher.group(1) != null) {
|
||||
this.content = this.content.replace("\"" + matcher.group(1) + "\"", "\"S" + index + "\"");
|
||||
|
||||
this.attributes.add(matcher.group(1));
|
||||
}
|
||||
|
||||
else {
|
||||
this.content = this.content.replace("'" + matcher.group(2) + "'", "'S" + index + "'");
|
||||
|
||||
this.attributes.add(matcher.group(2));
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
}
|
||||
|
||||
protected void insertStringsAndAttributes() {
|
||||
if(this.attributes.isEmpty())
|
||||
return;
|
||||
int i = 0;
|
||||
for(; i < this.attributes.size(); i++) {
|
||||
this.content = this.content.replace("\"S" + i + "\"", "\"" + attributes.get(i) + "\"");
|
||||
this.content = this.content.replace("'S" + i + "'", "'" + attributes.get(i) + "'");
|
||||
}
|
||||
|
||||
for(; i > 0; i--)
|
||||
this.attributes.removeFirst();
|
||||
}
|
||||
|
||||
protected void extractTexts() {
|
||||
Pattern pattern = Pattern.compile(">([^<]+)(?=<)", Pattern.DOTALL);
|
||||
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
|
||||
|
||||
int index = 0;
|
||||
|
||||
while (matcher.find()) {
|
||||
if(StringUtils.isEmptyString(matcher.group(1)))
|
||||
continue;
|
||||
|
||||
this.content = this.content.replace(">" + matcher.group(1) + "<", ">T" + index + "<");
|
||||
|
||||
this.texts.add(matcher.group(1));
|
||||
|
||||
index++;
|
||||
}
|
||||
}
|
||||
|
||||
protected void insertTexts() {
|
||||
if(this.texts.isEmpty())
|
||||
return;
|
||||
|
||||
int i = 0;
|
||||
for(; i < this.texts.size(); i++)
|
||||
this.content = this.content.replace(">T" + i + "<", ">" + this.texts.get(i) + "<");
|
||||
|
||||
for(; i > 0; i--)
|
||||
this.texts.removeFirst();
|
||||
}
|
||||
|
||||
protected void extractTags() {
|
||||
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user