1 year ago

#202715

test-img

allez l'OM

Node-Fetch API and Blob to convert Windows-1252 characters to utf-8

Node.js parsing an HTML page with Windows-1252 characters. Using node-fetch, response.text() loses all accented characters (gives a "�" for any kind of diacritics). Instead, response.blob() keeps everything; then FileReaderSync.readAsText(encoding) is expected to be a solution.

I had succeeded in parsing and decoding Windows-Excel csv files with a JS browser/workers version. But with node.js, I am using the npm "filereader" version of the API.

const fetch = require('node-fetch');
const FileReader = require('filereader');
const FileReaderSync = require('filereader');
const XMLHttpRequest = require('node-http-xhr');

/**
 * Probe whether the decoded HTML kept its accented characters.
 * The word "général" is known to appear in the original page; if the
 * decoding mangled accents (e.g. into U+FFFD "�"), the probe fails.
 * (Removed the unused local `wrong_accent` from the original.)
 * @param {string} html - decoded page body
 * @returns {boolean} true when accents survived decoding
 */
function processDO(html){
    const correct = html.includes("général"); // exists in original html
    console.log("result:", html.length, "accept accents?:", correct);
    return correct;
}
/**
 * Fetch `uri`, decode the body with response.text() (UTF-8 only) and
 * validate the result with `process`.
 * Resolves with "decoded" when `process` accepts the text, rejects with
 * "wrong accents" otherwise.
 * Bug fixed: the original .catch() only logged the error and never
 * settled the promise, so any network/decoding failure left the
 * returned promise pending forever — it now rejects with the error.
 * @param {string} uri - URL to fetch
 * @param {(html: string) => boolean} process - validator for the body
 * @returns {Promise<string>}
 */
function textFetchAndSave(uri, process){
    return new Promise((resolve, reject) =>
        fetch(uri)
        .then(a => console.log("--\tbefore text") || a.text())
        .then(b => process(b)? resolve("decoded") : reject("wrong accents"))
        .catch(e => { console.error(`textFetchAndSave error on ${uri}`); reject(e); })
    )
}
/**
 * Fetch `uri`, read the body as a Blob and decode it synchronously with
 * the npm "filereader" FileReaderSync using the given windows-1252
 * charset, then validate via `process`.
 * Bugs fixed:
 *  - `reject` was a free variable inside readAsText_Sync (only
 *    `resolve` was passed in), so the failure path threw a
 *    ReferenceError instead of rejecting — it is now passed explicitly;
 *  - the .catch() only logged, leaving the returned promise pending
 *    forever on any error — it now rejects.
 * NOTE(review): the npm "filereader" readAsText still raises
 * "cannot read as File" on a node-fetch Blob — that upstream issue is
 * unchanged here.
 * @param {string} uri - URL to fetch
 * @param {(html: string) => boolean} process - validator for the body
 * @returns {Promise<boolean>}
 */
function blobFetchAndSync(uri, process){
    const readAsText_Sync = (charset, resolve, reject) => blob => {
        function loadReader(reader, blob) {
            console.log("entering loadReaderSync", blob.type, blob.size, charset);
            let domstr = "not read yet";
            try { domstr = reader.readAsText(blob, charset);
            } catch (err) {console.error("error loadreadersync", err);}
            console.log("readAsText_Sync ->", domstr);
            return process(domstr)? resolve(true) : reject(false);
        }
        const reader = new FileReaderSync();
        return loadReader(reader, blob);
    };
    return new Promise((resolve, reject) => {
        fetch(uri)
        .then(a => console.log("--\tbefore blob") || a.blob())
        .then(blob => console.log("blob decode", blob.type, blob.size) || readAsText_Sync("windows-1252", resolve, reject)(blob) )
        .catch(e => { console.error(`blobFetchAndSync error`); reject(e); })
    });
}
// run
const p = "https://www.france-ferroviaire.info/Test/index.php";
const q = "?mod=FranceFer&ac=affichage&Type=Info&ID_ResData=32&ID_Data=2139444";
(async (pagename) => {
    const uri = encodeURI(pagename);
    // First attempt: plain response.text() — decodes, but mangles accents.
    try {
        let res = await textFetchAndSave(uri, processDO);
        console.log("text() result", res);
    } catch (err) {console.error("error .text():", err);}
    // Second attempt: blob + FileReaderSync with explicit charset.
    try {
        let res = await blobFetchAndSync(uri, processDO);
        console.log("blob() result", res);
    } catch (err) {console.error("error blob+fileReaderSync:", err);}
})(p + q); // bug fixed: q already starts with "?", so p+"?"+q built "...php??mod=..."

The try with response.text() works, but loses accents. Here is the output:

--      before text
result: 27095 accept accents?: false
error .text(): wrong accents

It works as expected: wrong accents, but text is read (and parsed, and saved in the full version of the program).

The try with blob() prints this:

--      before blob
blob decode text/html; charset=iso-8859-1 27119
entering loadReaderSync text/html; charset=iso-8859-1 27119 windows-1252
error with readersync.readAsText Error: cannot read as File: {}
at readFile (C:\Users\igm\WWW\node_modules\filereader\FileReader.js:266:15)
    at FileReader.self.readAsText (C:\Users\igm\WWW\node_modules\filereader\FileReader.js:295:7)
    at loadReader (C:\Users\igm\WWW\blobFerroviaire.js:55:21)
    at C:\Users\igm\WWW\blobFerroviaire.js:62:10
    at C:\Users\igm\WWW\blobFerroviaire.js:67:106
    at processTicksAndRejections (node:internal/process/task_queues:96:5)
    readAsText_Sync -> not read yet
    blobFetchAndSync error

The blob looks correct in mime and charset, but the error comes from the execution of 'reader.readAsText' telling "cannot read as File:". Indeed it is a Blob. From developer.mozilla:

the FileReaderSync interface allows to read File or Blob objects in a synchronous way into a DOMString

Is it an error in my program ? or in the npm filereader?


Though this doesn't answer my question about using the Blob + FileReader APIs, the similar question suggested by "Phil" — "JavaScript Fetch: characters with encoding issues" — solves the issue I was facing, with a combination of the ArrayBuffer + TextDecoder APIs.

Here is the code:

const fetch = require('node-fetch');
/**
 * Fetch `uri`, read the raw bytes with response.arrayBuffer() and
 * decode them with TextDecoder. Per the WHATWG Encoding Standard the
 * "iso-8859-1" label is an alias of windows-1252, which is exactly the
 * charset this page uses — so accents decode correctly.
 * Bug fixed: the original .catch() only logged, leaving the returned
 * promise pending forever on a network error — it now rejects.
 * @param {string} uri - URL to fetch
 * @param {(html: string) => boolean} process - validator for the body
 * @returns {Promise<string>} resolves "ok", rejects "wrong" or the error
 */
function arrayBufferFetchAndSave(uri, process){
    return new Promise((resolve, reject) =>
        fetch(uri)
        .then(a => a.arrayBuffer())
        .then(buffer => {
            const decoder = new TextDecoder('iso-8859-1');
            const decoded = decoder.decode(buffer);
            process(decoded)? resolve("ok") : reject("wrong");
        })
        .catch(e => { console.error(`arrayBufferFetchAndSave error`); reject(e); })
    );
}
// run — reuses processDO, p and q from the first snippet.
(async (pagename) => {
    const uri = encodeURI(pagename);
    try {
        let res = await arrayBufferFetchAndSave(uri, processDO);
        console.log("result:", res);
    } catch (err) {console.error(err);}
})(p + q); // bug fixed: q already starts with "?", so p+"?"+q built "...php??mod=..."

And 'TextDecoder' is in core node.js, no import!

But the 'npm filereader' is still not working as it should do...

node.js

fetch-api

cp1252

0 Answers

Your Answer

Accepted video resources