'use strict';
var EventEmitter = require('events').EventEmitter,
emits = require('emits'),
html = require('htmlparser2'),
domutils = require('domutils'),
util = require('util'),
uuid = require('uuid'),
async = require('async'),
url = require('url'),
request = require('request-promise'),
probeImageSize = require('probe-image-size'),
_ = require('lodash'),
sizeOf = require('image-size'),
validator = require('validator'),
helpers = require('./helpers'),
// Fallback attributes applied per AMP component when the source markup does
// not provide them, plus the timeout used for remote image requests.
// This continues the `var` declaration list started by the requires above.
DEFAULTS = {
    'amp-img': {
        layout: 'responsive',
        width: 600,
        height: 400
    },
    'amp-anim': {
        layout: 'responsive',
        width: 600,
        height: 400
    },
    'amp-iframe': {
        layout: 'responsive',
        width: 600,
        height: 400,
        sandbox: 'allow-scripts allow-same-origin'
    },
    'amp-youtube': {
        layout: 'responsive',
        width: 600,
        height: 400
    },
    'request_timeout': 3000
};

// these are formats supported by image-size but not probe-image-size
const FETCH_ONLY_FORMATS = [
    'cur', 'icns', 'ico', 'dds'
];
/**
 * Amperizer constructor. Borrows from Minimize.
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L15
 *
 * @constructor
 * @param {Object} [options] Options object, deep-merged over DEFAULTS
 * @api public
 */
function Amperize(options) {
    // Initialise the emitter state this prototype inherits (see util.inherits below).
    EventEmitter.call(this);

    // Merge into a fresh object so neither DEFAULTS nor the caller's options are mutated.
    this.config = _.merge({}, DEFAULTS, options || {});
    this.emits = emits;

    // The DOM handler reports (error, dom) by emitting a 'read' event on this instance.
    this.htmlParser = new html.Parser(
        new html.DomHandler(this.emits('read'))
    );
}

util.inherits(Amperize, EventEmitter);
/**
 * Parse the content and call the callback. Borrowed from Minimize.
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L51
 *
 * @param {String} content HTML
 * @param {Function} callback Invoked once the 'parsed: <id>' event fires
 * @throws {Error} When no callback function is supplied
 * @api public
 */
Amperize.prototype.parse = function parse(content, callback) {
    var id;

    if (typeof callback !== 'function') {
        throw new Error('No callback provided');
    }

    // A unique id keeps the completion event of this call apart from other parse() calls.
    id = uuid.v4();

    this.once('read', this.amperizer.bind(this, id));
    this.once('parsed: ' + id, callback);

    // Feed the document to the parser; its DOM handler emits 'read' when done.
    this.htmlParser.parseComplete(content);
};
/**
 * Turn a traversable DOM into string content. Borrowed from Minimize.
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L74
 *
 * @param {String} id Identifier of the parse() call being completed
 * @param {Object} error Error reported by the DOM handler, if any
 * @param {Object} dom Traversable DOM object
 * @throws {Error} When the DOM handler reported an error
 * @api private
 */
Amperize.prototype.amperizer = function amperizer(id, error, dom) {
    if (error) {
        // Error's second argument is an options bag; attach the original as `cause`
        // so it is not silently dropped.
        throw new Error('Amperizer failed to parse DOM', { cause: error });
    }

    this.traverse(dom, '', this.emits('parsed: ' + id));
};
/**
 * Reduce the traversable DOM object to a string. Borrows from Minimize.
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L90
 *
 * @param {Array} data
 * @param {String} html Compiled HTML contents
 * @param {Function} done Callback function
 * @api private
 */
// Declared `async` because the image pre-pass further down awaits the
// parallel dimension lookups before the DOM is reduced to a string.
Amperize.prototype.traverse = async function traverse(data, html, done) {
// keep a reference to the instance for the nested helper functions
var self = this;
// cache of image dimensions keyed by image URL; created anew on every
// traverse() call, so it is not shared between recursion levels
var imageSizeCache = {};
// NOTE(review): presumably the options handed to the image-size HTTP requests;
// the call sites in this view do not show their arguments - confirm.
var requestOptions = {
// We need the user-agent, otherwise some https requests may fail (e.g. Cloudflare)
headers: {
'User-Agent': 'Mozilla/5.0 Safari/537.36'
// NOTE(review): the closing `},` of `headers` is not present in this view.
timeout: self.config['request_timeout'],
encoding: null
// NOTE(review): the closing `};` of `requestOptions` is not present in this view.
// Choose the AMP `layout` for a media element that does not declare one.
// Elements narrower than 300px get layout="fixed": with layout="responsive"
// the media would be stretched and look bad. Everything else falls back to
// the layout configured for the element's tag name.
function setLayoutAttribute(element) {
    // An explicit layout always wins; returning early also avoids touching the
    // config for tag names that have no entry.
    if (element.attribs.layout) {
        return;
    }

    // A missing or non-numeric width compares false and so stays on the configured layout.
    element.attribs.layout = element.attribs.width < 300
        ? 'fixed'
        : self.config[element.name].layout;
}
// Certain component src attributes must use the 'https' protocol, otherwise they
// will not be validated by AMP. If we're unable to replace it, we will deal with
// the validation error, but at least we tried.
function useSecureSchema(element) {
    var src = element.attribs && element.attribs.src;

    if (typeof src !== 'string' || !src) {
        return;
    }

    // Only the scheme at the very start of the URL matters; an 'https://' that
    // appears later (e.g. inside a query string) must not block the upgrade.
    if (/^http:\/\//i.test(src)) {
        // Replace 'http' with 'https', so the validation passes
        element.attribs.src = src.replace(/^http:\/\//i, 'https://');
    } else if (src.indexOf('//') === 0) {
        // Giphy embedded iFrames are without protocol and start with '//', so at least
        // we can fix those cases.
        element.attribs.src = 'https:' + src;
    }
}
// probe will fetch the minimal amount of data needed to determine
// the image dimensions so it's more performant than a full fetch.
// Resolves with the probe result and remembers it in imageSizeCache.
function _probeImageSize(url) {
    return probeImageSize(
        url,
        requestOptions
    ).then(function (result) {
        imageSizeCache[url] = result;
        return result;
    });
}
// fetch the full image before reading dimensions using image-size,
// it's slower but has better format support.
// Resolves with the measured size and remembers it in imageSizeCache.
function _fetchImageSize(url) {
    return request(
        url,
        requestOptions
    ).then(function (response) {
        // requestOptions sets `encoding: null`, so the body is expected to be
        // raw bytes that image-size can inspect directly.
        var result = sizeOf(response);
        imageSizeCache[url] = result;
        return result;
    });
}
// select appropriate method to get image size; resolves with the size object
function _getImageSize(url) {
    // use cached image size if we've already seen this url
    if (imageSizeCache[url]) {
        return Promise.resolve(imageSizeCache[url]);
    }

    // fetch full image for formats we can't probe; the extension may be
    // followed by a query string or a fragment
    const extensionMatch = url.match(/\.([a-zA-Z]{3,4})(?:[?#]|$)/) || [];
    const extension = (extensionMatch[1] || '').toLowerCase();
    if (FETCH_ONLY_FORMATS.includes(extension)) {
        return _fetchImageSize(url);
    }

    // probe partial image for everything else
    return _probeImageSize(url);
}
// convert <img> to <amp-img> or <amp-anim>, fetching dimensions of
// external images. If anything fails leave the element as an <img>.
// Returns an async task function so that many conversions can be run in parallel.
function amperizeImageElem(element) {
    return async function () {
        if (!element.attribs || !element.attribs.src) {
            return;
        }

        var src = url.parse(element.attribs.src).href;

        // when we have a gif it should be <amp-anim>.
        element.name = src.match(/(\.gif$)/) ? 'amp-anim' : 'amp-img';

        if (src.indexOf('http') === 0) {
            // external image, fetch real dimensions
            try {
                if (!validator.isURL(src)) {
                    element.name = 'img';
                    return;
                }

                var dimensions = await _getImageSize(src);

                // CASE: `.ico` files might have multiple images and therefore multiple sizes.
                // We return the largest size found (image-size default is the first size found)
                if (dimensions.images) {
                    dimensions.width = _.maxBy(dimensions.images, function (w) {return w.width;}).width;
                    dimensions.height = _.maxBy(dimensions.images, function (h) {return h.height;}).height;
                }

                // Without both dimensions the AMP element would be invalid; keep the plain <img>.
                if (!dimensions.width || !dimensions.height) {
                    element.name = 'img';
                    return;
                }

                element.attribs.width = dimensions.width;
                element.attribs.height = dimensions.height;
            } catch (err) {
                // Deliberate best effort: any lookup failure leaves a plain <img> behind.
                element.name = 'img';
                return;
            }
        } else {
            // local image, use default fallback
            element.attribs.width = self.config[element.name].width;
            element.attribs.height = self.config[element.name].height;
        }

        if (!element.attribs.layout) {
            setLayoutAttribute(element);
        }
    };
}
// convert all of the img elements first so that we can perform lengthy
// network requests in parallel before sequentially traversing the DOM.
// Skipped entirely when the 'amp-img' config entry is falsy.
if (self.config['amp-img']) {
    var imgTest = function (elem) {
        return elem.name === 'img' && elem.attribs.src;
    };

    var imgElems = domutils.findAll(imgTest, data);
    var imgTasks = imgElems.map(function (elem) {
        return amperizeImageElem(elem);
    });

    // run at most 10 image lookups at a time
    await async.parallelLimit(imgTasks, 10);
}
// Sequentially traverse the DOM, folding every node of `data` into the `html`
// string accumulator; `done` is handed to async.reduce as the final callback.
// NOTE(review): several closing braces and, apparently, some statements are not
// present in this view (see the notes below) - restore them from the upstream
// file before relying on this block.
// Inside the iteratee the `html` parameter shadows the outer `html` argument.
async.reduce(data, html, function reduce(html, element, step) {
var children;
// Skip elements whose name matches this pattern: the accumulator is passed on
// unchanged. The pattern is unanchored, so it also matches names that merely
// contain one of these words.
if (/(style|script|textarea|link)/.test(element.name)) {
return step(null, html);
// Append helpers.close(element) to the accumulator and continue with the next
// node. The `error` argument is ignored; step is always called with null.
function close(error, html) {
html += helpers.close(element);
step(null, html);
// Append the output of the helper registered for this node type, then either
// close immediately (no children) or recurse into the children.
function enter() {
children = element.children;
html += helpers[element.type](element);
if (!children || !children.length) {
return close(null, html);
// Recurse on a later tick, passing `close` as the nested traversal's `done`.
// NOTE(review): presumably deferred to avoid deep synchronous recursion - confirm.
setImmediate(function delay() {
traverse.call(self, children, html, close);
// <iframe> handling: becomes <amp-youtube> or <amp-iframe>.
if (element.name === 'iframe') {
// an iframe without src is emitted untouched
if (!element.attribs.src) {
return enter();
// capture group 4 is the 11-character video id
var youtubeId = element.attribs.src.match(/^.*(youtu.be\/|youtube(-nocookie)?.com\/(v\/|.*u\/\w\/|embed\/|.*v=))([\w-]{11}).*/);
if (youtubeId) {
element.name = 'amp-youtube';
element.attribs['data-videoid'] = youtubeId[4];
// the video id replaces src; the iframe-only attributes are removed
delete element.attribs.src;
delete element.attribs.sandbox;
delete element.attribs.allowfullscreen;
delete element.attribs.allow;
delete element.attribs.frameborder;
} else {
element.name = 'amp-iframe';
// keep an existing sandbox value, otherwise use the configured default
element.attribs.sandbox = !element.attribs.sandbox ? self.config['amp-iframe'].sandbox : element.attribs.sandbox;
// frameborder and scrolling are normalised to '0' or '1'
if (element.attribs.hasOwnProperty('frameborder')) {
element.attribs.frameborder = element.attribs.frameborder === '0' ? '0' : '1';
if (element.attribs.hasOwnProperty('scrolling')) {
element.attribs.scrolling = element.attribs.scrolling === '0' ? '0' : '1';
// allowfullscreen / allowtransparency: the literal value 'false' removes the
// attribute, any other value is replaced by an empty string
if (element.attribs.hasOwnProperty('allowfullscreen')) {
if (element.attribs.allowfullscreen === 'false') {
delete element.attribs.allowfullscreen;
} else {
element.attribs.allowfullscreen = '';
if (element.attribs.hasOwnProperty('allowtransparency')) {
if (element.attribs.allowtransparency === 'false') {
delete element.attribs.allowtransparency;
} else {
element.attribs.allowtransparency = '';
// fill in missing dimensions from the 'amp-iframe' defaults
// NOTE(review): the condition also tests `layout`, but no statement in this view
// sets it; setLayoutAttribute is defined above yet never called here - confirm
// whether a call is missing.
if (!element.attribs.width || !element.attribs.height || !element.attribs.layout) {
element.attribs.width = !element.attribs.width ? self.config['amp-iframe'].width : element.attribs.width;
element.attribs.height = !element.attribs.height ? self.config['amp-iframe'].height : element.attribs.height;
// <audio> becomes <amp-audio>
if (element.name === 'audio') {
element.name = 'amp-audio';
// NOTE(review): the body of this condition (an element with src inside an
// <amp-audio> parent) is not present in this view; useSecureSchema is defined
// above but never called here - confirm against the upstream file.
if (element.attribs && element.attribs.src && element.parent && element.parent.name === 'amp-audio') {
return enter();
}, done);
// NOTE(review): the `};` closing the traverse function is not present in this view.
module.exports = Amperize;