HashMap value usage
in
Programming Questions
•
1 year ago
I have only used HashMap once so I am not too familiar with it. What I am trying to do is build a list of URLs I have visited with a spider. If the spider has not visited a site then it should be added to the HashMap to get visited later (Note, in the code below the spider does not visit sites stored in the HashMap yet).
Right now, if I am adding a new URL (stored as a String) to the HashMap, I just store the String as the key and assign the value to be 1. Does it matter what the value is for my purpose? I am really only interested in having a list of sites not to visit again, to avoid repeated visits. I imagine the list will become large when the spider visits multiple sites, so I figured a HashMap was the way to go. Here is the code so far:
// Root of the site being crawled; cleanURLs() prepends it to site-relative links.
String urlBase = "http://www.processing.org";
// Lines of the start page, loaded when this field is initialised.
// The address is derived from urlBase so the two cannot drift apart.
// loadStrings() returns null when the page cannot be loaded.
String[] source = loadStrings(urlBase + "/");
// URLs collected by doVisit(). Only the keys are used; the Integer value is a placeholder.
HashMap<String, Integer> visited = new HashMap<String, Integer>();
// Opens a 200x200 window, scans the start page for links and prints the
// URLs that were collected.
void setup() {
  size(200, 200);
  // loadStrings() yields null when the page could not be fetched; passing
  // that to join() would throw, so report the failure and stop here.
  if (source == null) {
    println("Could not load " + urlBase);
    return;
  }
  // The page is handed over as one string with the lines concatenated.
  doVisit(join(source, ""));
  println(visited);
}
- // Intentionally empty: the sketch does all of its work once, in setup().
- void draw() {
- }
// Turns raw href values into absolute URLs.
//
//   in       href values to normalise; the array is NOT modified.
//   returns  a new array containing, in input order:
//              - values that start with http:// or https:// (any case), unchanged;
//              - protocol-relative values ("//host/path") prefixed with the
//                scheme of urlBase;
//              - site-relative values ("/path") prefixed with urlBase.
//            Everything else (null entries, "#anchor", "mailto:", "javascript:",
//            relative paths without a leading slash) is dropped.
String[] cleanURLs(String[] in) {
  ArrayList<String> cleaned = new ArrayList<String>();
  if (in == null) {
    return new String[0];
  }

  // Scheme of urlBase including the colon, e.g. "http:"; used for "//host/path" links.
  int schemeEnd = urlBase.indexOf(':');
  String scheme = (schemeEnd == -1) ? "http:" : urlBase.substring(0, schemeEnd + 1);

  for (int i = 0; i < in.length; i++) {
    if (in[i] == null) {
      continue;
    }
    // Stray whitespace around an attribute value is not part of the URL.
    String url = in[i].trim();
    String lowerUrl = url.toLowerCase();

    // The scheme must be at the very start. Searching for it anywhere in the
    // string would accept values such as "/redirect?to=http://example.com".
    if (lowerUrl.startsWith("http://") || lowerUrl.startsWith("https://")) {
      cleaned.add(url);
    } else if (url.startsWith("//")) {
      // Protocol-relative link: it names another host, so only the scheme is missing.
      cleaned.add(scheme + url);
    } else if (url.startsWith("/")) {
      cleaned.add(urlBase + url);
    }
  }
  return cleaned.toArray(new String[cleaned.size()]);
}
// Extracts the quoted value of an attribute from each tag.
//
//   in         tag strings, e.g. "<a href='x'>text</a>"; the array is NOT modified.
//   attribute  attribute name to look for; matched literally and without
//              regard to case.
//   returns    a new array with one value per tag that contains the attribute
//              followed by a complete quoted string. Tags without the
//              attribute, without a closed quote pair, and null entries are
//              skipped.
String[] getAttributes(String[] in, String attribute) {
  ArrayList<String> values = new ArrayList<String>();
  if (in == null) {
    return new String[0];
  }
  String needle = attribute.toLowerCase();

  for (int i = 0; i < in.length; i++) {
    String tag = in[i];
    if (tag == null) {
      continue;
    }
    String lowerTag = tag.toLowerCase();
    int nameAt = lowerTag.indexOf(needle);
    if (nameAt == -1) {
      continue;
    }
    int from = nameAt + needle.length();

    // Locate a complete "..." pair and a complete '...' pair after the name.
    int dqStart = lowerTag.indexOf('"', from);
    int dqEnd = (dqStart == -1) ? -1 : lowerTag.indexOf('"', dqStart + 1);
    int sqStart = lowerTag.indexOf('\'', from);
    int sqEnd = (sqStart == -1) ? -1 : lowerTag.indexOf('\'', sqStart + 1);
    boolean hasDouble = dqEnd != -1;
    boolean hasSingle = sqEnd != -1;

    // Use whichever complete pair opens first. Always preferring double
    // quotes would return "y" for <a href='x' class="y">.
    if (hasDouble && (!hasSingle || dqStart < sqStart)) {
      values.add(tag.substring(dqStart + 1, dqEnd));
    } else if (hasSingle) {
      values.add(tag.substring(sqStart + 1, sqEnd));
    }
  }
  return values.toArray(new String[values.size()]);
}
// Cuts out every section of a document that begins with one keyword and
// finishes with the next occurrence of another.
//
//   in       text to scan; a null text is treated as containing no tags.
//   start    keyword that opens a section, e.g. "<a". Compared without regard
//            to case, so it no longer has to be passed in lowercase.
//   end      keyword that closes a section, e.g. "</a>". Same case handling.
//   returns  the sections in document order, each running from its start
//            keyword through the end keyword, taken from the original text so
//            the original casing is kept. An empty array is returned, and a
//            message printed, when start never occurs or when any occurrence
//            of start has no end after it.
String[] getTags(String in, String start, String end) {
  ArrayList<String> tags = new ArrayList<String>();
  boolean foundStart = false;
  boolean unterminated = false;

  // An empty start keyword matches at every position and would never let the
  // scan finish, so it is treated as "not found".
  if (in != null && start.length() > 0) {
    String lowerIn = in.toLowerCase();
    String startKey = start.toLowerCase();
    String endKey = end.toLowerCase();
    int from = 0;
    while (true) {
      int open = lowerIn.indexOf(startKey, from);
      if (open == -1) {
        break;
      }
      foundStart = true;
      // The next search resumes one character past this start keyword.
      from = open + 1;
      int close = lowerIn.indexOf(endKey, from);
      if (close == -1) {
        unterminated = true;
        break;
      }
      tags.add(in.substring(open, close + endKey.length()));
    }
  }

  if (!foundStart) {
    println(start+" not found");
    return new String[0];
  }
  if (unterminated) {
    println(end+" not found for every case of "+start);
    return new String[0];
  }
  return tags.toArray(new String[tags.size()]);
}
// Scans one page for links and records every URL not seen before.
//
//   in  the page's HTML as a single string.
//
// The page passes through getTags -> getAttributes -> cleanURLs. When a stage
// yields nothing, a message naming that stage is printed and the scan stops.
// Otherwise every resulting URL missing from `visited` is added with the
// placeholder value 1.
void doVisit(String in) {
  String[] anchors = getTags(in, "<a", "</a>");
  if (anchors.length == 0) {
    println("Anchor array empty for "+in);
    return;
  }

  String[] hrefs = getAttributes(anchors, "href");
  if (hrefs.length == 0) {
    println("Href array empty for "+in);
    return;
  }

  String[] urls = cleanURLs(hrefs);
  if (urls.length == 0) {
    println("URL array empty for "+in);
    return;
  }

  // Existing entries are left untouched; only unseen URLs are inserted.
  for (String url : urls) {
    if (visited.containsKey(url)) {
      continue;
    }
    visited.put(url, 1);
  }
}
1