java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - chromium/dom-distiller - Git at Google

 // Copyright 2015 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 package org.chromium.distiller.extractors.embeds;

 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
 import com.google.gwt.dom.client.ImageElement;
 import com.google.gwt.dom.client.NodeList;
 import org.chromium.distiller.DomUtil;
 import org.chromium.distiller.JavaScript;
 import org.chromium.distiller.LogUtil;
 import org.chromium.distiller.StringUtil;
 import org.chromium.distiller.webdocument.WebFigure;
 import org.chromium.distiller.webdocument.WebImage;

 import java.util.HashSet;
 import java.util.Set;

 /**
  * This class treats images as another type of embed and provides heuristics for lead image
  * candidacy.
  *
  * <p>{@link #extract} accepts IMG, PICTURE and FIGURE elements, as well as SPAN elements whose
  * class attribute contains "lazy-image-placeholder". The {@code imgSrc}, {@code width} and
  * {@code height} fields are scratch state that is overwritten by every extraction.
  */
 public class ImageExtractor implements EmbedExtractor {
     private static final Set<String> relevantTags = new HashSet<>();

     // Attributes probed, in this order, for the URL of a lazily-loaded image.
     private static final String[] LAZY_IMAGE_ATTRIBUTES =
             {"data-src", "data-original", "datasrc", "data-url"};

     private String imgSrc;
     private int width;
     private int height;

     static {
         // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
         relevantTags.add("IMG");
         relevantTags.add("PICTURE");
         relevantTags.add("FIGURE");
         relevantTags.add("SPAN");
     }

     /**
      * @return the tag names this extractor is willing to look at.
      */
     @Override
     public Set<String> getRelevantTagNames() {
         return relevantTags;
     }

     /**
      * Extracts an image from the given element.
      *
      * @param e the candidate element.
      * @return a WebFigure for a FIGURE, a WebImage for the other supported tags, or null when
      *         the tag is not relevant, a SPAN is not a lazy-image placeholder, or no IMG element
      *         can be found to read the image attributes from.
      */
     @Override
     public WebImage extract(Element e) {
         String tagName = e.getTagName();
         if (!relevantTags.contains(tagName)) {
             return null;
         }
         imgSrc = "";

         // SPAN placeholders carry everything in data-* attributes and need no IMG lookup.
         if ("SPAN".equals(tagName)) {
             return extractLazyPlaceholder(e);
         }

         // The lookup result is checked for null before it is handed to ImageElement.as() or
         // dereferenced: a PICTURE (standalone or inside a FIGURE) may have no IMG descendant.
         Element firstImg = DomUtil.getFirstElementByTagNameInc(e, "IMG");

         if ("FIGURE".equals(tagName)) {
             return extractFigure(e, firstImg);
         }

         if (firstImg == null) {
             return null;
         }
         extractImageAttributes(ImageElement.as(firstImg));
         return new WebImage(e, width, height, imgSrc);
     }

     /**
      * Builds a WebFigure from a FIGURE element.
      *
      * @param figure the FIGURE element.
      * @param firstImg the IMG element the attributes are read from; may be null.
      * @return the figure, or null when it holds neither a PICTURE nor an IMG, or when there is
      *         no IMG to read the source and dimensions from.
      */
     private WebImage extractFigure(Element figure, Element firstImg) {
         Element img = DomUtil.getFirstElementByTagName(figure, "PICTURE");
         if (img == null) {
             img = DomUtil.getFirstElementByTagName(figure, "IMG");
         }
         if (img == null || firstImg == null) {
             return null;
         }
         extractImageAttributes(ImageElement.as(firstImg));

         Element figcaption;
         Element cap = DomUtil.getFirstElementByTagName(figure, "FIGCAPTION");
         if (cap != null) {
             // We look for links because some sites put non-caption
             // elements into <figcaption>. For example: image credit
             // could contain a link. So we get the whole DOM structure within
             // <figcaption> only when it contains links, otherwise we get the innerText.
             NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF]");
             figcaption = links.getLength() > 0 ? cap : createFigcaptionElement(cap);
         } else {
             figcaption = createFigcaptionElement(figure);
         }
         return new WebFigure(img, width, height, imgSrc, figcaption);
     }

     /**
      * Builds a WebImage from a SPAN that stands in for a lazily-loaded image.
      *
      * @param span the SPAN element.
      * @return the image described by the span's data-* attributes, or null when the span's
      *         class does not contain "lazy-image-placeholder".
      */
     private WebImage extractLazyPlaceholder(Element span) {
         if (!span.getAttribute("class").contains("lazy-image-placeholder")) {
             return null;
         }
         // Image lazy loading on Wikipedia.
         ImageElement ie = Document.get().createImageElement();
         imgSrc = span.getAttribute("data-src");
         width = JavaScript.parseInt(span.getAttribute("data-width"));
         height = JavaScript.parseInt(span.getAttribute("data-height"));
         ie.setAttribute("srcset", span.getAttribute("data-srcset"));
         return new WebImage(ie, width, height, imgSrc);
     }

     /**
      * Fills {@code imgSrc}, {@code width} and {@code height} from the given image element.
      *
      * @param imageElement the IMG element to read; must not be null.
      */
     private void extractImageAttributes(ImageElement imageElement) {
         // This will get the absolute URL of the image and
         // the displayed image dimension.
         // Try to get lazily-loaded images before falling back to get the src attribute.
         for (String attr : LAZY_IMAGE_ATTRIBUTES) {
             imgSrc = imageElement.getAttribute(attr);
             if (!imgSrc.isEmpty()) {
                 break;
             }
         }
         if (!imgSrc.isEmpty()) {
             // We cannot trust the dimension if the image is not loaded yet.
             // In some cases there are 1x1 placeholder images.
             width = 0;
             height = 0;
         } else {
             imgSrc = imageElement.getSrc();
             // As an ImageElement is manipulated here, it is possible
             // to get the real dimensions.
             width = imageElement.getWidth();
             height = imageElement.getHeight();
         }
         if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
             LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
         }
     }

     /**
      * Creates a new FIGCAPTION element holding the trimmed inner text of the given element.
      *
      * @param element the element whose inner text becomes the caption.
      * @return the newly created FIGCAPTION element.
      */
     private Element createFigcaptionElement(Element element) {
         Element figcaption = Document.get().createElement("FIGCAPTION");
         // element.innerText might contain leading/trailing new lines or whitespaces when running
         // in older version of Chrome (before http://crrev.com/c/1114673). Both behaviors are
         // acceptable.
         // See https://crbug.com/651764.
         figcaption.setInnerText(StringUtil.jsTrim(DomUtil.getInnerText(element)));
         return figcaption;
     }
 }
	// Copyright 2015 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	package org.chromium.distiller.extractors.embeds;

	import com.google.gwt.dom.client.Document;
	import com.google.gwt.dom.client.Element;
	import com.google.gwt.dom.client.ImageElement;
	import com.google.gwt.dom.client.NodeList;
	import org.chromium.distiller.DomUtil;
	import org.chromium.distiller.JavaScript;
	import org.chromium.distiller.LogUtil;
	import org.chromium.distiller.StringUtil;
	import org.chromium.distiller.webdocument.WebFigure;
	import org.chromium.distiller.webdocument.WebImage;

	import java.util.HashSet;
	import java.util.Set;

	/**
	 * This class treats images as another type of embed and provides heuristics for lead image
	 * candidacy.
	 *
	 * <p>{@link #extract} accepts IMG, PICTURE and FIGURE elements, as well as SPAN elements whose
	 * class attribute contains "lazy-image-placeholder". The {@code imgSrc}, {@code width} and
	 * {@code height} fields are scratch state that is overwritten by every extraction.
	 */
	public class ImageExtractor implements EmbedExtractor {
	    private static final Set<String> relevantTags = new HashSet<>();

	    // Attributes probed, in this order, for the URL of a lazily-loaded image.
	    private static final String[] LAZY_IMAGE_ATTRIBUTES =
	            {"data-src", "data-original", "datasrc", "data-url"};

	    private String imgSrc;
	    private int width;
	    private int height;

	    static {
	        // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
	        relevantTags.add("IMG");
	        relevantTags.add("PICTURE");
	        relevantTags.add("FIGURE");
	        relevantTags.add("SPAN");
	    }

	    /**
	     * @return the tag names this extractor is willing to look at.
	     */
	    @Override
	    public Set<String> getRelevantTagNames() {
	        return relevantTags;
	    }

	    /**
	     * Extracts an image from the given element.
	     *
	     * @param e the candidate element.
	     * @return a WebFigure for a FIGURE, a WebImage for the other supported tags, or null when
	     *         the tag is not relevant, a SPAN is not a lazy-image placeholder, or no IMG element
	     *         can be found to read the image attributes from.
	     */
	    @Override
	    public WebImage extract(Element e) {
	        String tagName = e.getTagName();
	        if (!relevantTags.contains(tagName)) {
	            return null;
	        }
	        imgSrc = "";

	        // SPAN placeholders carry everything in data-* attributes and need no IMG lookup.
	        if ("SPAN".equals(tagName)) {
	            return extractLazyPlaceholder(e);
	        }

	        // The lookup result is checked for null before it is handed to ImageElement.as() or
	        // dereferenced: a PICTURE (standalone or inside a FIGURE) may have no IMG descendant.
	        Element firstImg = DomUtil.getFirstElementByTagNameInc(e, "IMG");

	        if ("FIGURE".equals(tagName)) {
	            return extractFigure(e, firstImg);
	        }

	        if (firstImg == null) {
	            return null;
	        }
	        extractImageAttributes(ImageElement.as(firstImg));
	        return new WebImage(e, width, height, imgSrc);
	    }

	    /**
	     * Builds a WebFigure from a FIGURE element.
	     *
	     * @param figure the FIGURE element.
	     * @param firstImg the IMG element the attributes are read from; may be null.
	     * @return the figure, or null when it holds neither a PICTURE nor an IMG, or when there is
	     *         no IMG to read the source and dimensions from.
	     */
	    private WebImage extractFigure(Element figure, Element firstImg) {
	        Element img = DomUtil.getFirstElementByTagName(figure, "PICTURE");
	        if (img == null) {
	            img = DomUtil.getFirstElementByTagName(figure, "IMG");
	        }
	        if (img == null || firstImg == null) {
	            return null;
	        }
	        extractImageAttributes(ImageElement.as(firstImg));

	        Element figcaption;
	        Element cap = DomUtil.getFirstElementByTagName(figure, "FIGCAPTION");
	        if (cap != null) {
	            // We look for links because some sites put non-caption
	            // elements into <figcaption>. For example: image credit
	            // could contain a link. So we get the whole DOM structure within
	            // <figcaption> only when it contains links, otherwise we get the innerText.
	            NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF]");
	            figcaption = links.getLength() > 0 ? cap : createFigcaptionElement(cap);
	        } else {
	            figcaption = createFigcaptionElement(figure);
	        }
	        return new WebFigure(img, width, height, imgSrc, figcaption);
	    }

	    /**
	     * Builds a WebImage from a SPAN that stands in for a lazily-loaded image.
	     *
	     * @param span the SPAN element.
	     * @return the image described by the span's data-* attributes, or null when the span's
	     *         class does not contain "lazy-image-placeholder".
	     */
	    private WebImage extractLazyPlaceholder(Element span) {
	        if (!span.getAttribute("class").contains("lazy-image-placeholder")) {
	            return null;
	        }
	        // Image lazy loading on Wikipedia.
	        ImageElement ie = Document.get().createImageElement();
	        imgSrc = span.getAttribute("data-src");
	        width = JavaScript.parseInt(span.getAttribute("data-width"));
	        height = JavaScript.parseInt(span.getAttribute("data-height"));
	        ie.setAttribute("srcset", span.getAttribute("data-srcset"));
	        return new WebImage(ie, width, height, imgSrc);
	    }

	    /**
	     * Fills {@code imgSrc}, {@code width} and {@code height} from the given image element.
	     *
	     * @param imageElement the IMG element to read; must not be null.
	     */
	    private void extractImageAttributes(ImageElement imageElement) {
	        // This will get the absolute URL of the image and
	        // the displayed image dimension.
	        // Try to get lazily-loaded images before falling back to get the src attribute.
	        for (String attr : LAZY_IMAGE_ATTRIBUTES) {
	            imgSrc = imageElement.getAttribute(attr);
	            if (!imgSrc.isEmpty()) {
	                break;
	            }
	        }
	        if (!imgSrc.isEmpty()) {
	            // We cannot trust the dimension if the image is not loaded yet.
	            // In some cases there are 1x1 placeholder images.
	            width = 0;
	            height = 0;
	        } else {
	            imgSrc = imageElement.getSrc();
	            // As an ImageElement is manipulated here, it is possible
	            // to get the real dimensions.
	            width = imageElement.getWidth();
	            height = imageElement.getHeight();
	        }
	        if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
	            LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
	        }
	    }

	    /**
	     * Creates a new FIGCAPTION element holding the trimmed inner text of the given element.
	     *
	     * @param element the element whose inner text becomes the caption.
	     * @return the newly created FIGCAPTION element.
	     */
	    private Element createFigcaptionElement(Element element) {
	        Element figcaption = Document.get().createElement("FIGCAPTION");
	        // element.innerText might contain leading/trailing new lines or whitespaces when running
	        // in older version of Chrome (before http://crrev.com/c/1114673). Both behaviors are
	        // acceptable.
	        // See https://crbug.com/651764.
	        figcaption.setInnerText(StringUtil.jsTrim(DomUtil.getInnerText(element)));
	        return figcaption;
	    }
	}