MDL-52954 core: Change from pandoc to unoconv - it gives better results

Most importantly, it retains formatting better and supports different character sets far better than pandoc does.
This commit is contained in:
Damyon Wiese 2016-02-26 11:52:40 +08:00
parent 128d8736d3
commit 1356d85151
10 changed files with 75 additions and 49 deletions

View file

@ -166,7 +166,7 @@ function behat_clean_init_config() {
'umaskpermissions', 'dbtype', 'dblibrary', 'dbhost', 'dbname', 'dbuser', 'dbpass', 'prefix',
'dboptions', 'proxyhost', 'proxyport', 'proxytype', 'proxyuser', 'proxypassword',
'proxybypass', 'theme', 'pathtogs', 'pathtodu', 'aspellpath', 'pathtodot', 'skiplangupgrade',
'altcacheconfigpath', 'pathtopandoc'
'altcacheconfigpath', 'pathtounoconv'
));
// Add extra allowed settings.

View file

@ -187,21 +187,29 @@ class file_storage {
* @param string $format The desired format - e.g. 'pdf'. Formats are specified by file extension.
* @return bool - True if the format is supported for input.
*/
protected function is_input_format_supported_by_pandoc($format) {
protected function is_format_supported_by_unoconv($format) {
global $CFG;
if (!isset($this->unoconvformats)) {
// Ask unoconv for its list of supported document formats.
$cmd = escapeshellcmd(trim($CFG->pathtounoconv)) . ' --show';
$pipes = array();
$pipesspec = array(2 => array('pipe', 'w'));
$proc = proc_open($cmd, $pipesspec, $pipes);
$programoutput = stream_get_contents($pipes[2]);
fclose($pipes[2]);
proc_close($proc);
$matches = array();
preg_match_all('/\[\.(.*)\]/', $programoutput, $matches);
$this->unoconvformats = $matches[1];
$this->unoconvformats = array_unique($this->unoconvformats);
}
$sanitized = trim(strtolower($format));
return in_array($sanitized, array('md', 'html', 'tex', 'docx', 'odt', 'epub', 'png', 'jpg', 'gif'));
return in_array($sanitized, $this->unoconvformats);
}
/**
* Verify the format is supported.
*
* @param string $format The desired format - e.g. 'pdf'. Formats are specified by file extension.
* @return bool - True if the format is supported for output.
*/
protected function is_output_format_supported_by_pandoc($format) {
$sanitized = trim(strtolower($format));
return in_array($sanitized, array('md', 'pdf', 'html', 'tex', 'docx', 'odt', 'odf', 'epub'));
}
/**
* Perform a file format conversion on the specified document.
@ -213,17 +221,17 @@ class file_storage {
protected function create_converted_document(stored_file $file, $format) {
global $CFG;
if (empty($CFG->pathtopandoc) || !is_executable(trim($CFG->pathtopandoc))) {
if (empty($CFG->pathtounoconv) || !is_executable(trim($CFG->pathtounoconv))) {
// No conversions are possible, sorry.
return false;
}
$fileextension = strtolower(pathinfo($file->get_filename(), PATHINFO_EXTENSION));
if (!self::is_input_format_supported_by_pandoc($fileextension)) {
if (!self::is_format_supported_by_unoconv($fileextension)) {
return false;
}
if (!self::is_output_format_supported_by_pandoc($format)) {
if (!self::is_format_supported_by_unoconv($format)) {
return false;
}
@ -236,21 +244,14 @@ class file_storage {
$filename = $tmp . '/' . $localfilename;
$file->copy_content_to($filename);
if (in_array($fileextension, array('gif', 'jpg', 'png'))) {
// We wrap images in a tiny html file - pandoc will generate documents from them.
$htmlwrapperfile = $tmp . '/wrapper.html';
file_put_contents($htmlwrapperfile, "<html><body><img src=\"$localfilename\"></body></html>");
$filename = $htmlwrapperfile;
}
$newtmpfile = pathinfo($filename, PATHINFO_FILENAME) . '.' . $format;
// Safety: sanitise the generated file name before building the output path.
$newtmpfile = $tmp . '/' . clean_param($newtmpfile, PARAM_FILE);
$cmd = escapeshellcmd(trim($CFG->pathtopandoc)) . ' ' .
$cmd = escapeshellcmd(trim($CFG->pathtounoconv)) . ' ' .
escapeshellarg('-f') . ' ' .
escapeshellarg($format) . ' ' .
escapeshellarg('-o') . ' ' .
escapeshellarg($newtmpfile) . ' ' .
escapeshellarg($filename);
@ -259,6 +260,7 @@ class file_storage {
$output = null;
$currentdir = getcwd();
chdir($tmp);
$result = exec('env 1>&2', $output);
$result = exec($cmd, $output);
chdir($currentdir);
if (!file_exists($newtmpfile)) {

View file

@ -186,7 +186,7 @@ $allowed = array('wwwroot', 'dataroot', 'dirroot', 'admin', 'directorypermission
'dbtype', 'dblibrary', 'dbhost', 'dbname', 'dbuser', 'dbpass', 'prefix', 'dboptions',
'proxyhost', 'proxyport', 'proxytype', 'proxyuser', 'proxypassword', 'proxybypass', // keep proxy settings from config.php
'altcacheconfigpath', 'pathtogs', 'pathtodu', 'aspellpath', 'pathtodot',
'pathtopandoc'
'pathtounoconv'
);
$productioncfg = (array)$CFG;
$CFG = new stdClass();

View file

@ -15,7 +15,7 @@
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
/**
* Test pandoc functionality.
* Test unoconv functionality.
*
* @package core
* @category phpunit
@ -27,14 +27,14 @@ defined('MOODLE_INTERNAL') || die();
/**
* A set of tests for some of the pandoc functionality within Moodle.
* A set of tests for some of the unoconv functionality within Moodle.
*
* @package core
* @category phpunit
* @copyright 2016 Damyon Wiese
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
class core_pandoc_testcase extends advanced_testcase {
class core_unoconv_testcase extends advanced_testcase {
private $testfile1 = null;
private $testfile2 = null;
@ -51,7 +51,7 @@ class core_pandoc_testcase extends advanced_testcase {
'filepath' => '/',
'filename' => 'test.html'
);
$teststring = file_get_contents($this->fixturepath . DIRECTORY_SEPARATOR . 'pandoc-source.html');
$teststring = file_get_contents($this->fixturepath . DIRECTORY_SEPARATOR . 'unoconv-source.html');
$this->testfile1 = $fs->create_file_from_string($filerecord, $teststring);
$filerecord = array(
@ -62,7 +62,7 @@ class core_pandoc_testcase extends advanced_testcase {
'filepath' => '/',
'filename' => 'test.docx'
);
$teststring = file_get_contents($this->fixturepath . DIRECTORY_SEPARATOR . 'pandoc-source.docx');
$teststring = file_get_contents($this->fixturepath . DIRECTORY_SEPARATOR . 'unoconv-source.docx');
$this->testfile2 = $fs->create_file_from_string($filerecord, $teststring);
$this->resetAfterTest();
@ -71,16 +71,18 @@ class core_pandoc_testcase extends advanced_testcase {
public function test_generate_pdf() {
global $CFG;
if (empty($CFG->pathtopandoc) || !is_executable(trim($CFG->pathtopandoc))) {
if (empty($CFG->pathtounoconv) || !is_executable(trim($CFG->pathtounoconv))) {
// No conversions are possible, sorry.
return $this->markTestSkipped();
}
$fs = get_file_storage();
$result = $fs->get_converted_document($this->testfile1, 'pdf');
$this->assertNotFalse($result);
$this->assertSame($result->get_mimetype(), 'application/pdf');
$this->assertGreaterThan(0, $result->get_filesize());
$result = $fs->get_converted_document($this->testfile2, 'pdf');
$this->assertNotFalse($result);
$this->assertSame($result->get_mimetype(), 'application/pdf');
$this->assertGreaterThan(0, $result->get_filesize());
}
@ -88,16 +90,18 @@ class core_pandoc_testcase extends advanced_testcase {
public function test_generate_markdown() {
global $CFG;
if (empty($CFG->pathtopandoc) || !is_executable(trim($CFG->pathtopandoc))) {
if (empty($CFG->pathtounoconv) || !is_executable(trim($CFG->pathtounoconv))) {
// No conversions are possible, sorry.
return $this->markTestSkipped();
}
$fs = get_file_storage();
$result = $fs->get_converted_document($this->testfile1, 'md');
$result = $fs->get_converted_document($this->testfile1, 'txt');
$this->assertNotFalse($result);
$this->assertSame($result->get_mimetype(), 'text/plain');
$this->assertGreaterThan(0, $result->get_filesize());
$result = $fs->get_converted_document($this->testfile2, 'md');
$result = $fs->get_converted_document($this->testfile2, 'txt');
$this->assertNotFalse($result);
$this->assertSame($result->get_mimetype(), 'text/plain');
$this->assertGreaterThan(0, $result->get_filesize());
}