Then, using Crawler4j, I created a graph of all the URLs starting with my blog, their relationships to other URLs, and all the words those URLs contain, along with each word's position on the page.
/**
 * Decide whether the given URL should be crawled; anything matching
 * the exclusion pattern is skipped.
 */
@Override
public boolean shouldVisit(final WebURL url) {
    final String href = url.getURL().toLowerCase();
    return !NodeConstants.FILTERS.matcher(href).matches();
}
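NodeConstants.FILTERS isn't shown here; it's just the usual Crawler4j-style exclusion pattern that skips stylesheets, scripts, images and archives. A hypothetical version, modelled on the standard Crawler4j example, would look something like:

// Hypothetical filter constant (java.util.regex.Pattern):
// skip URLs pointing at stylesheets, scripts, images and archives.
public static final Pattern FILTERS =
        Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|ico|mp3|zip|gz))$");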
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 */
@Override
public void visit(final Page page) {
    final String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);
    final Index<Node> nodeIndex = graphDb.index().forNodes(NodeConstants.PAGE_INDEX);
    if (page.getParseData() instanceof HtmlParseData) {
        final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        final String text = htmlParseData.getText();
        final List<WebURL> links = htmlParseData.getOutgoingUrls();
        final Transaction tx = graphDb.beginTx();
        try {
            // Create and index a node for the page itself.
            final Node pageNode = graphDb.createNode();
            pageNode.setProperty(NodeConstants.URL, url);
            nodeIndex.add(pageNode, NodeConstants.URL, url);
            // Create a node per word, recording its position on the page.
            final List<String> words = cleanAndSplitString(text);
            int index = 0;
            for (final String word : words) {
                final Node wordNode = graphDb.createNode();
                wordNode.setProperty(NodeConstants.WORD, word);
                wordNode.setProperty(NodeConstants.INDEX, index++);
                final Relationship relationship =
                        pageNode.createRelationshipTo(wordNode, RelationshipTypes.CONTAINS);
                relationship.setProperty(NodeConstants.SOURCE, url);
            }
            // Record each outgoing link as a LINK_TO relationship.
            for (final WebURL webURL : links) {
                System.out.println("Linking to " + webURL);
                final Node linkNode = graphDb.createNode();
                linkNode.setProperty(NodeConstants.URL, webURL.getURL());
                final Relationship relationship =
                        pageNode.createRelationshipTo(linkNode, RelationshipTypes.LINK_TO);
                relationship.setProperty(NodeConstants.SOURCE, url);
                relationship.setProperty(NodeConstants.DESTINATION, webURL.getURL());
            }
            tx.success();
        } finally {
            tx.finish();
        }
    }
}
private static List<String> cleanAndSplitString(final String input) {
    if (input != null) {
        // Lower-case, strip punctuation and digits, then split on whitespace.
        final String[] words = input.toLowerCase()
                .replaceAll("\\p{Punct}", "")
                .replaceAll("\\p{Digit}", "")
                .split("\\s+");
        return Arrays.asList(words);
    }
    return new ArrayList<>();
}
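That covers the WebCrawler subclass itself; the crawl is kicked off with Crawler4j's standard CrawlController boilerplate. A minimal sketch, assuming the crawler class above is called BlogCrawler and using placeholder values for the storage folder and crawl depth:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class CrawlerMain {
    public static void main(final String[] args) throws Exception {
        final CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j"); // placeholder intermediate-data folder
        config.setMaxDepthOfCrawling(2);                // placeholder depth

        final PageFetcher pageFetcher = new PageFetcher(config);
        final RobotstxtServer robotstxtServer =
                new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

        final CrawlController controller =
                new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://www.briandupreez.net/");
        controller.start(BlogCrawler.class, 1); // a single crawler thread is enough here
    }
}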
After the data was collected, I could query it and perform the functions of a search engine.
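The task internals aren't shown in this post, but to give an idea of the kind of query involved, a word-frequency count over this graph could be done with the plain Neo4j Java API roughly as below. This is a sketch only: the helper name and traversal strategy are mine, not WordFrequencyTask's actual code, and GlobalGraphOperations comes from org.neo4j.tooling.

private static Map<String, Integer> countWordOnPages(final GraphDatabaseService graphDb,
                                                     final String term) {
    final Map<String, Integer> frequencies = new HashMap<>();
    for (final Node page : GlobalGraphOperations.at(graphDb).getAllNodes()) {
        // Word nodes have no URL property; skip them outright. Link stubs
        // have a URL but no CONTAINS edges, so they fall out below with a zero count.
        if (!page.hasProperty(NodeConstants.URL)) {
            continue;
        }
        int count = 0;
        for (final Relationship contains
                : page.getRelationships(Direction.OUTGOING, RelationshipTypes.CONTAINS)) {
            if (term.equals(contains.getEndNode().getProperty(NodeConstants.WORD, null))) {
                count++;
            }
        }
        if (count > 0) {
            frequencies.put((String) page.getProperty(NodeConstants.URL), count);
        }
    }
    return frequencies;
}

The actual ranking work is parcelled out to a handful of tasks run in parallel: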
final ExecutorService executorService = Executors.newFixedThreadPool(4);
final String[] searchTerms = {"java", "spring"};
final List<Callable<TaskResponse>> tasks = new ArrayList<>();
tasks.add(new WordFrequencyTask(searchTerms));
tasks.add(new DocumentLocationTask(searchTerms));
tasks.add(new PageRankTask(searchTerms));
tasks.add(new NeuralNetworkTask(searchTerms));
final List<Future<TaskResponse>> results = executorService.invokeAll(tasks);
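invokeAll blocks until every task has run; the individual results can then be pulled out of the futures. A minimal sketch of collecting them (TaskResponse is the type used by the tasks above; how the separate scores get combined into a final ranking is a separate step):

for (final Future<TaskResponse> future : results) {
    try {
        // get() re-throws anything the task threw while running.
        final TaskResponse response = future.get();
        System.out.println(response);
    } catch (final ExecutionException e) {
        e.printStackTrace();
    }
}
executorService.shutdown();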