Merge pull request #2668 from simong/tidy
Tidy HTML before trying to convert it with abiword
This commit is contained in:
commit
5615bab0d9
5 changed files with 156 additions and 35 deletions
|
@ -10,12 +10,12 @@
|
||||||
// favicon default name
|
// favicon default name
|
||||||
// alternatively, set up a fully specified Url to your own favicon
|
// alternatively, set up a fully specified Url to your own favicon
|
||||||
"favicon": "favicon.ico",
|
"favicon": "favicon.ico",
|
||||||
|
|
||||||
//IP and port which etherpad should bind at
|
//IP and port which etherpad should bind at
|
||||||
"ip": "0.0.0.0",
|
"ip": "0.0.0.0",
|
||||||
"port" : 9001,
|
"port" : 9001,
|
||||||
|
|
||||||
/*
|
/*
|
||||||
// Node native SSL support
|
// Node native SSL support
|
||||||
// this is disabled by default
|
// this is disabled by default
|
||||||
//
|
//
|
||||||
|
@ -37,17 +37,17 @@
|
||||||
"dbSettings" : {
|
"dbSettings" : {
|
||||||
"filename" : "var/dirty.db"
|
"filename" : "var/dirty.db"
|
||||||
},
|
},
|
||||||
|
|
||||||
/* An Example of MySQL Configuration
|
/* An Example of MySQL Configuration
|
||||||
"dbType" : "mysql",
|
"dbType" : "mysql",
|
||||||
"dbSettings" : {
|
"dbSettings" : {
|
||||||
"user" : "root",
|
"user" : "root",
|
||||||
"host" : "localhost",
|
"host" : "localhost",
|
||||||
"password": "",
|
"password": "",
|
||||||
"database": "store"
|
"database": "store"
|
||||||
},
|
},
|
||||||
*/
|
*/
|
||||||
|
|
||||||
//the default text of a pad
|
//the default text of a pad
|
||||||
"defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n",
|
"defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n",
|
||||||
|
|
||||||
|
@ -65,7 +65,7 @@
|
||||||
"chatAndUsers": false,
|
"chatAndUsers": false,
|
||||||
"lang": "en-gb"
|
"lang": "en-gb"
|
||||||
},
|
},
|
||||||
|
|
||||||
/* Should we suppress errors from being visible in the default Pad Text? */
|
/* Should we suppress errors from being visible in the default Pad Text? */
|
||||||
"suppressErrorsInPadText" : false,
|
"suppressErrorsInPadText" : false,
|
||||||
|
|
||||||
|
@ -77,35 +77,39 @@
|
||||||
|
|
||||||
/* Users, who have a valid session, automatically get granted access to password protected pads */
|
/* Users, who have a valid session, automatically get granted access to password protected pads */
|
||||||
"sessionNoPassword" : false,
|
"sessionNoPassword" : false,
|
||||||
|
|
||||||
/* if true, all css & js will be minified before sending to the client. This will improve the loading performance massively,
|
/* if true, all css & js will be minified before sending to the client. This will improve the loading performance massively,
|
||||||
but makes it impossible to debug the javascript/css */
|
but makes it impossible to debug the javascript/css */
|
||||||
"minify" : true,
|
"minify" : true,
|
||||||
|
|
||||||
/* How long may clients use served javascript code (in seconds)? Without versioning this
|
/* How long may clients use served javascript code (in seconds)? Without versioning this
|
||||||
may cause problems during deployment. Set to 0 to disable caching */
|
may cause problems during deployment. Set to 0 to disable caching */
|
||||||
"maxAge" : 21600, // 60 * 60 * 6 = 6 hours
|
"maxAge" : 21600, // 60 * 60 * 6 = 6 hours
|
||||||
|
|
||||||
/* This is the path to the Abiword executable. Setting it to null, disables abiword.
|
/* This is the path to the Abiword executable. Setting it to null, disables abiword.
|
||||||
Abiword is needed for advanced import/export features of pads*/
|
Abiword is needed for advanced import/export features of pads*/
|
||||||
"abiword" : null,
|
"abiword" : null,
|
||||||
|
|
||||||
|
/* This is the path to the Tidy executable. Setting it to null, disables Tidy.
|
||||||
|
Tidy is used to improve the quality of exported pads*/
|
||||||
|
"tidyHtml" : null,
|
||||||
|
|
||||||
/* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */
|
/* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */
|
||||||
"allowUnknownFileEnds" : true,
|
"allowUnknownFileEnds" : true,
|
||||||
|
|
||||||
/* This setting is used if you require authentication of all users.
|
/* This setting is used if you require authentication of all users.
|
||||||
Note: /admin always requires authentication. */
|
Note: /admin always requires authentication. */
|
||||||
"requireAuthentication" : false,
|
"requireAuthentication" : false,
|
||||||
|
|
||||||
/* Require authorization by a module, or a user with is_admin set, see below. */
|
/* Require authorization by a module, or a user with is_admin set, see below. */
|
||||||
"requireAuthorization" : false,
|
"requireAuthorization" : false,
|
||||||
|
|
||||||
/*when you use NginX or another proxy/ load-balancer set this to true*/
|
/*when you use NginX or another proxy/ load-balancer set this to true*/
|
||||||
"trustProxy" : false,
|
"trustProxy" : false,
|
||||||
|
|
||||||
/* Privacy: disable IP logging */
|
/* Privacy: disable IP logging */
|
||||||
"disableIPlogging" : false,
|
"disableIPlogging" : false,
|
||||||
|
|
||||||
/* Users for basic authentication. is_admin = true gives access to /admin.
|
/* Users for basic authentication. is_admin = true gives access to /admin.
|
||||||
If you do not uncomment this, /admin will not be available! */
|
If you do not uncomment this, /admin will not be available! */
|
||||||
/*
|
/*
|
||||||
|
@ -126,7 +130,7 @@
|
||||||
|
|
||||||
// Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance.
|
// Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance.
|
||||||
"loadTest": false,
|
"loadTest": false,
|
||||||
|
|
||||||
/* The toolbar buttons configuration.
|
/* The toolbar buttons configuration.
|
||||||
"toolbar": {
|
"toolbar": {
|
||||||
"left": [
|
"left": [
|
||||||
|
@ -148,7 +152,7 @@
|
||||||
|
|
||||||
/* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */
|
/* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */
|
||||||
"loglevel": "INFO",
|
"loglevel": "INFO",
|
||||||
|
|
||||||
//Logging configuration. See log4js documentation for further information
|
//Logging configuration. See log4js documentation for further information
|
||||||
// https://github.com/nomiddlename/log4js-node
|
// https://github.com/nomiddlename/log4js-node
|
||||||
// You can add as many appenders as you want here:
|
// You can add as many appenders as you want here:
|
||||||
|
|
|
@ -28,6 +28,7 @@ var fs = require("fs");
|
||||||
var settings = require('../utils/Settings');
|
var settings = require('../utils/Settings');
|
||||||
var os = require('os');
|
var os = require('os');
|
||||||
var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks");
|
var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks");
|
||||||
|
var TidyHtml = require('../utils/TidyHtml');
|
||||||
|
|
||||||
//load abiword only if its enabled
|
//load abiword only if its enabled
|
||||||
if(settings.abiword != null)
|
if(settings.abiword != null)
|
||||||
|
@ -35,28 +36,28 @@ if(settings.abiword != null)
|
||||||
|
|
||||||
var tempDirectory = "/tmp";
|
var tempDirectory = "/tmp";
|
||||||
|
|
||||||
//tempDirectory changes if the operating system is windows
|
//tempDirectory changes if the operating system is windows
|
||||||
if(os.type().indexOf("Windows") > -1)
|
if(os.type().indexOf("Windows") > -1)
|
||||||
{
|
{
|
||||||
tempDirectory = process.env.TEMP;
|
tempDirectory = process.env.TEMP;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* do a requested export
|
* do a requested export
|
||||||
*/
|
*/
|
||||||
exports.doExport = function(req, res, padId, type)
|
exports.doExport = function(req, res, padId, type)
|
||||||
{
|
{
|
||||||
var fileName = padId;
|
var fileName = padId;
|
||||||
|
|
||||||
// allow fileName to be overwritten by a hook, the type is kept static for security reasons
|
// allow fileName to be overwritten by a hook, the type is kept static for security reasons
|
||||||
hooks.aCallFirst("exportFileName", padId,
|
hooks.aCallFirst("exportFileName", padId,
|
||||||
function(err, hookFileName){
|
function(err, hookFileName){
|
||||||
// if fileName is set then set it to the padId, note that fileName is returned as an array.
|
// if fileName is set then set it to the padId, note that fileName is returned as an array.
|
||||||
if(hookFileName.length) fileName = hookFileName;
|
if(hookFileName.length) fileName = hookFileName;
|
||||||
|
|
||||||
//tell the browser that this is a downloadable file
|
//tell the browser that this is a downloadable file
|
||||||
res.attachment(fileName + "." + type);
|
res.attachment(fileName + "." + type);
|
||||||
|
|
||||||
//if this is a plain text export, we can do this directly
|
//if this is a plain text export, we can do this directly
|
||||||
// We have to over engineer this because tabs are stored as attributes and not plain text
|
// We have to over engineer this because tabs are stored as attributes and not plain text
|
||||||
if(type == "etherpad"){
|
if(type == "etherpad"){
|
||||||
|
@ -72,7 +73,7 @@ exports.doExport = function(req, res, padId, type)
|
||||||
var txt;
|
var txt;
|
||||||
var randNum;
|
var randNum;
|
||||||
var srcFile, destFile;
|
var srcFile, destFile;
|
||||||
|
|
||||||
async.series([
|
async.series([
|
||||||
//render the txt document
|
//render the txt document
|
||||||
function(callback)
|
function(callback)
|
||||||
|
@ -96,7 +97,7 @@ exports.doExport = function(req, res, padId, type)
|
||||||
{
|
{
|
||||||
//ensure html can be collected by the garbage collector
|
//ensure html can be collected by the garbage collector
|
||||||
txt = null;
|
txt = null;
|
||||||
|
|
||||||
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
|
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
|
||||||
abiword.convertFile(srcFile, destFile, type, callback);
|
abiword.convertFile(srcFile, destFile, type, callback);
|
||||||
},
|
},
|
||||||
|
@ -140,7 +141,7 @@ exports.doExport = function(req, res, padId, type)
|
||||||
var html;
|
var html;
|
||||||
var randNum;
|
var randNum;
|
||||||
var srcFile, destFile;
|
var srcFile, destFile;
|
||||||
|
|
||||||
async.series([
|
async.series([
|
||||||
//render the html document
|
//render the html document
|
||||||
function(callback)
|
function(callback)
|
||||||
|
@ -150,7 +151,7 @@ exports.doExport = function(req, res, padId, type)
|
||||||
if(ERR(err, callback)) return;
|
if(ERR(err, callback)) return;
|
||||||
html = _html;
|
html = _html;
|
||||||
callback();
|
callback();
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
//decide what to do with the html export
|
//decide what to do with the html export
|
||||||
function(callback)
|
function(callback)
|
||||||
|
@ -162,22 +163,29 @@ exports.doExport = function(req, res, padId, type)
|
||||||
hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){
|
hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){
|
||||||
if(newHTML.length) html = newHTML;
|
if(newHTML.length) html = newHTML;
|
||||||
res.send(html);
|
res.send(html);
|
||||||
callback("stop");
|
callback("stop");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
else //write the html export to a file
|
else //write the html export to a file
|
||||||
{
|
{
|
||||||
randNum = Math.floor(Math.random()*0xFFFFFFFF);
|
randNum = Math.floor(Math.random()*0xFFFFFFFF);
|
||||||
srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html";
|
srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html";
|
||||||
fs.writeFile(srcFile, html, callback);
|
fs.writeFile(srcFile, html, callback);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
//send the convert job to abiword
|
|
||||||
|
// Tidy up the exported HTML
|
||||||
function(callback)
|
function(callback)
|
||||||
{
|
{
|
||||||
//ensure html can be collected by the garbage collector
|
//ensure html can be collected by the garbage collector
|
||||||
html = null;
|
html = null;
|
||||||
|
|
||||||
|
TidyHtml.tidy(srcFile, callback);
|
||||||
|
},
|
||||||
|
|
||||||
|
//send the convert job to abiword
|
||||||
|
function(callback)
|
||||||
|
{
|
||||||
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
|
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
|
||||||
abiword.convertFile(srcFile, destFile, type, callback);
|
abiword.convertFile(srcFile, destFile, type, callback);
|
||||||
},
|
},
|
||||||
|
@ -199,7 +207,7 @@ exports.doExport = function(req, res, padId, type)
|
||||||
//100ms delay to accommodate for slow windows fs
|
//100ms delay to accommodate for slow windows fs
|
||||||
if(os.type().indexOf("Windows") > -1)
|
if(os.type().indexOf("Windows") > -1)
|
||||||
{
|
{
|
||||||
setTimeout(function()
|
setTimeout(function()
|
||||||
{
|
{
|
||||||
fs.unlink(destFile, callback);
|
fs.unlink(destFile, callback);
|
||||||
}, 100);
|
}, 100);
|
||||||
|
|
|
@ -152,6 +152,11 @@ exports.minify = true;
|
||||||
*/
|
*/
|
||||||
exports.abiword = null;
|
exports.abiword = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The path of the tidy executable
|
||||||
|
*/
|
||||||
|
exports.tidyHtml = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should we support non-natively supported file types on import?
|
* Should we support non-natively supported file types on import?
|
||||||
*/
|
*/
|
||||||
|
@ -167,7 +172,7 @@ exports.loglevel = "INFO";
|
||||||
*/
|
*/
|
||||||
exports.disableIPlogging = false;
|
exports.disableIPlogging = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Disable Load Testing
|
* Disable Load Testing
|
||||||
*/
|
*/
|
||||||
exports.loadTest = false;
|
exports.loadTest = false;
|
||||||
|
@ -239,7 +244,7 @@ exports.reloadSettings = function reloadSettings() {
|
||||||
} else {
|
} else {
|
||||||
settingsFilename = path.resolve(path.join(exports.root, settingsFilename));
|
settingsFilename = path.resolve(path.join(exports.root, settingsFilename));
|
||||||
}
|
}
|
||||||
|
|
||||||
var settingsStr;
|
var settingsStr;
|
||||||
try{
|
try{
|
||||||
//read the settings sync
|
//read the settings sync
|
||||||
|
|
41
src/node/utils/TidyHtml.js
Normal file
41
src/node/utils/TidyHtml.js
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
/**
|
||||||
|
* Tidy up the HTML in a given file
|
||||||
|
*/
|
||||||
|
|
||||||
|
var log4js = require('log4js');
|
||||||
|
var settings = require('./Settings');
|
||||||
|
var spawn = require('child_process').spawn;
|
||||||
|
|
||||||
|
exports.tidy = function(srcFile, callback) {
|
||||||
|
var logger = log4js.getLogger('TidyHtml');
|
||||||
|
|
||||||
|
// Don't do anything if Tidy hasn't been enabled
|
||||||
|
if (!settings.tidyHtml) {
|
||||||
|
logger.debug('tidyHtml has not been configured yet, ignoring tidy request');
|
||||||
|
return callback(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
var errMessage = '';
|
||||||
|
|
||||||
|
// Spawn a new tidy instance that cleans up the file inline
|
||||||
|
logger.debug('Tidying ' + srcFile);
|
||||||
|
var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]);
|
||||||
|
|
||||||
|
// Keep track of any error messages
|
||||||
|
tidy.stderr.on('data', function (data) {
|
||||||
|
errMessage += data.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Wait until Tidy is done
|
||||||
|
tidy.on('close', function(code) {
|
||||||
|
// Tidy returns a 0 when no errors occur and a 1 exit code when
|
||||||
|
// the file could be tidied but a few warnings were generated
|
||||||
|
if (code === 0 || code === 1) {
|
||||||
|
logger.debug('Tidied ' + srcFile + ' successfully');
|
||||||
|
return callback(null);
|
||||||
|
} else {
|
||||||
|
logger.error('Failed to tidy ' + srcFile + '\n' + errMessage);
|
||||||
|
return callback('Tidy died with exit code ' + code);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
};
|
63
tests/backend/specs/api/tidy.js
Normal file
63
tests/backend/specs/api/tidy.js
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
// Declared in a single `var` statement. Every declarator needs its trailing
// comma: without it the statement ends after the first declarator and the
// remaining names are assigned as implicit globals.
// TidyHtml and Settings are filled in by the `before` hook once npm is loaded.
var assert = require('assert'),
    fs = require('fs'),
    path = require('path'),
    TidyHtml = null,
    Settings = null;
|
||||||
|
|
||||||
|
var npm = require("../../../../src/node_modules/npm/lib/npm.js");
|
||||||
|
|
||||||
|
// Specs for TidyHtml.tidy. Both tests are skipped when no Tidy executable
// has been configured in the settings.
describe('tidyHtml', function() {
  // TidyHtml and Settings are required only after npm has been loaded
  before(function(done) {
    npm.load({}, function(err) {
      assert.ok(!err);
      TidyHtml = require('../../../../src/node/utils/TidyHtml');
      Settings = require('../../../../src/node/utils/Settings');
      return done();
    });
  });

  it('Tidies HTML', function(done) {
    // If the user hasn't configured Tidy, we skip this test as Tidy is required for it
    if (!Settings.tidyHtml) {
      this.skip();
    }

    // Try to tidy up a bad HTML file
    var tmpDir = process.env.TEMP || "/tmp";
    var tmpFile = path.join(tmpDir, 'tmp_' + (Math.floor(Math.random() * 1000000)) + '.html');
    fs.writeFileSync(tmpFile, '<html><body><p>a paragraph</p><li>List without outer UL</li>trailing closing p</p></body></html>');
    TidyHtml.tidy(tmpFile, function(err){
      // Read the file again and remove it before asserting, so that a
      // failing assertion does not leave the temporary file behind
      var cleanedHtml = fs.readFileSync(tmpFile).toString();
      fs.unlinkSync(tmpFile);

      assert.ok(!err);

      var expectedHtml = [
        '<title></title>',
        '</head>',
        '<body>',
        '<p>a paragraph</p>',
        '<ul>',
        '<li>List without outer UL</li>',
        '<li style="list-style: none">trailing closing p</li>',
        '</ul>',
        '</body>',
        '</html>',
      ].join('\n');
      assert.notStrictEqual(cleanedHtml.indexOf(expectedHtml), -1);
      return done();
    });
  });

  it('can deal with errors', function(done) {
    // If the user hasn't configured Tidy, we skip this test as Tidy is required for it
    if (!Settings.tidyHtml) {
      this.skip();
    }

    // Tidy is pointed at a path that does not exist and must report an error
    TidyHtml.tidy('/some/none/existing/file.html', function(err) {
      assert.ok(err);
      return done();
    });
  });
});
|
Loading…
Reference in a new issue