ofx_preproc.cpp

Go to the documentation of this file.
00001 /***************************************************************************
00002           ofx_preproc.cpp 
00003                              -------------------
00004     copyright            : (C) 2002 by Benoit Grégoire
00005     email                : bock@step.polymtl.ca
00006 ***************************************************************************/
00012 /***************************************************************************
00013  *                                                                         *
00014  *   This program is free software; you can redistribute it and/or modify  *
00015  *   it under the terms of the GNU General Public License as published by  *
00016  *   the Free Software Foundation; either version 2 of the License, or     *
00017  *   (at your option) any later version.                                   *
00018  *                                                                         *
00019  ***************************************************************************/
00020 #include "../config.h"
00021 #include <iostream>
00022 #include <fstream>
00023 #include <stdlib.h>
00024 #include <stdio.h>
00025 #include <string>
00026 #include "ParserEventGeneratorKit.h"
00027 #include "libofx.h"
00028 #include "messages.hh"
00029 #include "ofx_sgml.hh"
00030 #include "ofc_sgml.hh"
00031 #include "ofx_preproc.hh"
00032 #ifdef HAVE_ICONV
00033 #include <iconv.h>
00034 #endif
00035 
/* Character encodings handed to iconv: the source encoding assumed when the
   file headers do not select one, and the encoding of the generated output. */
#define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
#define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"

using namespace std;

/* Number of entries in DTD_SEARCH_PATH.  One extra slot is reserved when the
   build system supplies MAKEFILE_DTD_PATH. */
#ifndef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 3;
#else
const int DTD_SEARCH_PATH_NUM = 4;
#endif

/* Directories probed, in order, by find_dtd().  Each entry is prepended
   verbatim to the DTD file name, hence the trailing '/'.
   NOTE(review): "~/" is handed to the file stream unchanged; a stream open
   does not perform shell-style tilde expansion, so this entry presumably only
   matches a directory literally named "~" -- confirm. */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] = {
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH,
#endif
  "/usr/local/share/libofx/dtd/",
  "/usr/share/libofx/dtd/",
  "~/"
};

/* Size, in bytes, of the fixed line buffers used while preprocessing. */
const unsigned int READ_BUFFER_SIZE = 1024;
00060 
00065 CFCT int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
00066   {
00067   LibofxContext *libofx_context;
00068   bool ofx_start=false;
00069   bool ofx_end=false;
00070 
00071   ifstream input_file;
00072   ofstream tmp_file;
00073   char buffer[READ_BUFFER_SIZE];
00074   char iconv_buffer[READ_BUFFER_SIZE];
00075   string s_buffer;
00076   char *filenames[3];
00077   char tmp_filename[50];
00078 #ifdef HAVE_ICONV
00079         iconv_t conversion_descriptor;
00080 #endif
00081   libofx_context=(LibofxContext*)ctx;
00082 
00083   if(p_filename!=NULL&&strcmp(p_filename,"")!=0)
00084     {
00085     message_out(DEBUG, string("ofx_proc_file():Opening file: ")+ p_filename);
00086     
00087     input_file.open(p_filename);
00088     strncpy(tmp_filename,"/tmp/libofxtmpXXXXXX",50);
00089     mkstemp(tmp_filename);
00090     tmp_file.open(tmp_filename);
00091 
00092     message_out(DEBUG,"ofx_proc_file(): Creating temp file: "+string(tmp_filename));
00093     if(!input_file){
00094       message_out(ERROR,"ofx_proc_file():Unable to open the input file "+string(p_filename));
00095     }
00096     else if(!tmp_file){
00097       message_out(ERROR,"ofx_proc_file():Unable to open the output file "+string(tmp_filename));
00098     }
00099     else
00100       {
00101         int header_separator_idx;
00102         string header_name;
00103         string header_value;
00104         string ofx_encoding;
00105         string ofx_charset;
00106         do {
00107           input_file.getline(buffer, sizeof(buffer),'\n');
00108           //cout<<buffer<<"\n";
00109           s_buffer.assign(buffer);
00110           //cout<<"input_file.gcount(): "<<input_file.gcount()<<" sizeof(buffer): "<<sizeof(buffer)<<endl;
00111           if(input_file.gcount()<(sizeof(buffer)-1))
00112             {
00113               s_buffer.append("\n");
00114             }
00115           else if( !input_file.eof()&&input_file.fail())
00116             {
00117               input_file.clear();
00118             }
00119           int ofx_start_idx;
00120           if (ofx_start==false &&
00121               (
00122                (libofx_context->currentFileType()==OFX&&
00123                 ((ofx_start_idx=s_buffer.find("<OFX>"))!=
00124                  string::npos||(ofx_start_idx=s_buffer.find("<ofx>"))!=string::npos))
00125                || (libofx_context->currentFileType()==OFC&&
00126                    ((ofx_start_idx=s_buffer.find("<OFC>"))!=string::npos||
00127                     (ofx_start_idx=s_buffer.find("<ofc>"))!=string::npos))
00128               )
00129              )
00130             {
00131               ofx_start=true;
00132               s_buffer.erase(0,ofx_start_idx);//Fix for really broken files that don't have a newline after the header.
00133               message_out(DEBUG,"ofx_proc_file():<OFX> or <OFC> has been found");
00134 #ifdef HAVE_ICONV
00135               string fromcode;
00136               string tocode; 
00137               if(ofx_encoding.compare("USASCII")==0){
00138                 if(ofx_charset.compare("ISO-8859-1")==0){
00139                   fromcode="ISO-8859-1";
00140                 }
00141                 else if(ofx_charset.compare("1252")==0){
00142                   fromcode="CP1252";
00143                 }
00144                 else if(ofx_charset.compare("NONE")==0){
00145                   fromcode=LIBOFX_DEFAULT_INPUT_ENCODING;
00146                 }
00147               }
00148               else if(ofx_encoding.compare("USASCII")==0) {
00149                 fromcode="UTF-8";
00150               }
00151               else
00152                 {
00153                   fromcode=LIBOFX_DEFAULT_INPUT_ENCODING;
00154                 }
00155               tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
00156               message_out(DEBUG,"ofx_proc_file(): Setting up iconv for fromcode: "+fromcode+", tocode: "+tocode);
00157               conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
00158 #endif
00159             }
00160           else {
00161             //We are still in the headers
00162             if ((header_separator_idx=s_buffer.find(':')) != string::npos) {
00163               //Header processing
00164               header_name.assign(s_buffer.substr(0,header_separator_idx));
00165               header_value.assign(s_buffer.substr(header_separator_idx+1));
00166               message_out(DEBUG,"ofx_proc_file():Header: "+header_name+" with value: "+header_value+" has been found");
00167               if(header_name.compare("ENCODING")==0) {
00168                 ofx_encoding.assign(header_value);
00169               }
00170               if(header_name.compare("CHARSET")==0) {
00171                 ofx_charset.assign(header_value);
00172               }
00173             }
00174           }
00175 
00176           if(ofx_start==true && ofx_end==false){
00177             s_buffer=sanitize_proprietary_tags(s_buffer);
00178             //cout<< s_buffer<<"\n";
00179 #ifdef HAVE_ICONV
00180             memset(iconv_buffer,0,READ_BUFFER_SIZE);
00181             size_t inbytesleft = strlen(s_buffer.c_str());
00182             size_t outbytesleft = READ_BUFFER_SIZE;
00183             char * inchar = (char *)s_buffer.c_str();
00184             char * outchar = iconv_buffer;
00185             int iconv_retval = iconv (conversion_descriptor,
00186                     &inchar, &inbytesleft,
00187                    &outchar, &outbytesleft);
00188             if(iconv_retval==-1){
00189               message_out(ERROR,"ofx_proc_file(): Conversion error");
00190             }
00191             s_buffer = iconv_buffer;
00192 #endif
00193               tmp_file.write(s_buffer.c_str(), s_buffer.length());
00194           }
00195           
00196           if (ofx_start==true &&
00197               (
00198                (libofx_context->currentFileType()==OFX &&
00199                 ((ofx_start_idx=s_buffer.find("</OFX>"))!=string::npos ||
00200                  (ofx_start_idx=s_buffer.find("</ofx>"))!=string::npos))
00201                || (libofx_context->currentFileType()==OFC &&
00202                    ((ofx_start_idx=s_buffer.find("</OFC>"))!=string::npos ||
00203                     (ofx_start_idx=s_buffer.find("</ofc>"))!=string::npos))
00204               )
00205              )
00206             {
00207               ofx_end=true;
00208               message_out(DEBUG,"ofx_proc_file():</OFX> or </OFC>  has been found");
00209             }
00210 
00211         } while(!input_file.eof()&&!input_file.bad());
00212       }
00213     input_file.close();
00214     tmp_file.close();
00215 #ifdef HAVE_ICONV
00216               iconv_close(conversion_descriptor);
00217 #endif
00218     char filename_openspdtd[255];
00219     char filename_dtd[255];
00220     char filename_ofx[255];
00221     strncpy(filename_openspdtd,find_dtd(OPENSPDCL_FILENAME).c_str(),255);//The opensp sgml dtd file
00222     if(libofx_context->currentFileType()==OFX)
00223       {
00224         strncpy(filename_dtd,find_dtd(OFX160DTD_FILENAME).c_str(),255);//The ofx dtd file
00225       }
00226     else if(libofx_context->currentFileType()==OFC)
00227       {
00228         strncpy(filename_dtd,find_dtd(OFCDTD_FILENAME).c_str(),255);//The ofc dtd file
00229       }
00230     else
00231       {
00232         message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00233       }
00234 
00235     if((string)filename_dtd!="" && (string)filename_openspdtd!="")
00236       {
00237         strncpy(filename_ofx,tmp_filename,255);//The processed ofx file
00238         filenames[0]=filename_openspdtd;
00239         filenames[1]=filename_dtd;
00240         filenames[2]=filename_ofx;
00241         if(libofx_context->currentFileType()==OFX)
00242           {
00243             ofx_proc_sgml(libofx_context, 3,filenames);
00244           }
00245         else if(libofx_context->currentFileType()==OFC)
00246           {
00247             ofc_proc_sgml(libofx_context, 3,filenames);
00248           }
00249         else
00250           {
00251             message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00252           }
00253         if(remove(tmp_filename)!=0)
00254           {
00255             message_out(ERROR,"ofx_proc_file(): Error deleting temporary file "+string(tmp_filename));
00256           }
00257       }
00258     else
00259       {
00260         message_out(ERROR,"ofx_proc_file(): FATAL: Missing DTD, aborting");
00261       }
00262   }
00263   else{
00264     message_out(ERROR,"ofx_proc_file():No input file specified");
00265   }
00266   return 0;
00267 }
00268 
00269 
00270 
00271 CFCT int libofx_proc_buffer(LibofxContextPtr ctx,
00272                             const char *s, unsigned int size){
00273   ofstream tmp_file;
00274   string s_buffer;
00275   char *filenames[3];
00276   char tmp_filename[50];
00277   int pos;
00278   LibofxContext *libofx_context;
00279 
00280   libofx_context=(LibofxContext*)ctx;
00281 
00282   if (size==0) {
00283     message_out(ERROR,
00284                 "ofx_proc_file(): bad size");
00285     return -1;
00286   }
00287   s_buffer=string(s, size);
00288 
00289   strncpy(tmp_filename,"/tmp/libofxtmpXXXXXX",50);
00290   mkstemp(tmp_filename);
00291   tmp_file.open(tmp_filename);
00292 
00293   message_out(DEBUG,"ofx_proc_file(): Creating temp file: "+string(tmp_filename));
00294   if(!tmp_file){
00295     message_out(ERROR,"ofx_proc_file():Unable to open the output file "+string(tmp_filename));
00296     return -1;
00297   }
00298 
00299   if (libofx_context->currentFileType()==OFX) {
00300     pos=s_buffer.find("<OFX>");
00301     if (pos==string::npos)
00302       pos=s_buffer.find("<ofx>");
00303   }
00304   else if (libofx_context->currentFileType()==OFC) {
00305     pos=s_buffer.find("<OFC>");
00306     if (pos==string::npos)
00307       pos=s_buffer.find("<ofc>");
00308   }
00309   else {
00310     message_out(ERROR,"ofx_proc(): unknown file type");
00311     return -1;
00312   }
00313   if (pos==string::npos || pos > s_buffer.size()) {
00314     message_out(ERROR,"ofx_proc():<OFX> has not been found");
00315     return -1;
00316   }
00317   else {
00318     // erase everything before the OFX tag
00319     s_buffer.erase(0, pos);
00320     message_out(DEBUG,"ofx_proc_file():<OF?> has been found");
00321   }
00322 
00323   if (libofx_context->currentFileType()==OFX) {
00324     pos=s_buffer.find("</OFX>");
00325     if (pos==string::npos)
00326       pos=s_buffer.find("</ofx>");
00327   }
00328   else if (libofx_context->currentFileType()==OFC) {
00329     pos=s_buffer.find("</OFC>");
00330     if (pos==string::npos)
00331       pos=s_buffer.find("</ofc>");
00332   }
00333   else {
00334     message_out(ERROR,"ofx_proc(): unknown file type");
00335     return -1;
00336   }
00337 
00338   if (pos==string::npos || pos > s_buffer.size()) {
00339     message_out(ERROR,"ofx_proc():</OF?> has not been found");
00340     return -1;
00341   }
00342   else {
00343     // erase everything after the /OFX tag
00344     if (s_buffer.size() > pos+6)
00345       s_buffer.erase(pos+6);
00346     message_out(DEBUG,"ofx_proc_file():<OFX> has been found");
00347   }
00348 
00349   s_buffer=sanitize_proprietary_tags(s_buffer);
00350   tmp_file.write(s_buffer.c_str(), s_buffer.length());
00351 
00352   tmp_file.close();
00353 
00354   char filename_openspdtd[255];
00355   char filename_dtd[255];
00356   char filename_ofx[255];
00357   strncpy(filename_openspdtd,find_dtd(OPENSPDCL_FILENAME).c_str(),255);//The opensp sgml dtd file
00358   if(libofx_context->currentFileType()==OFX){
00359     strncpy(filename_dtd,find_dtd(OFX160DTD_FILENAME).c_str(),255);//The ofx dtd file
00360   }
00361   else if(libofx_context->currentFileType()==OFC){
00362     strncpy(filename_dtd,find_dtd(OFCDTD_FILENAME).c_str(),255);//The ofc dtd file
00363   }
00364   else {
00365     message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00366   }
00367 
00368   if((string)filename_dtd!="" && (string)filename_openspdtd!=""){
00369     strncpy(filename_ofx,tmp_filename,255);//The processed ofx file
00370     filenames[0]=filename_openspdtd;
00371     filenames[1]=filename_dtd;
00372     filenames[2]=filename_ofx;
00373     if(libofx_context->currentFileType()==OFX){
00374       ofx_proc_sgml(libofx_context, 3,filenames);
00375     }
00376     else if(libofx_context->currentFileType()==OFC){
00377       ofc_proc_sgml(libofx_context, 3,filenames);
00378     }
00379     else {
00380       message_out(ERROR,string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00381     }
00382     if(remove(tmp_filename)!=0){
00383       message_out(ERROR,"ofx_proc_file(): Error deleting temporary file "+string(tmp_filename));
00384     }
00385   }
00386   else {
00387     message_out(ERROR,"ofx_proc_file(): FATAL: Missing DTD, aborting");
00388   }
00389 
00390   return 0;
00391 }
00392 
00393 
00394 
00395 
00396 
00397 
00402 string sanitize_proprietary_tags(string input_string)
00403 {
00404   unsigned int i;
00405   size_t input_string_size;
00406   bool strip=false;
00407   bool tag_open=false;
00408   int tag_open_idx=0;//Are we within < > ?
00409   bool closing_tag_open=false;//Are we within </ > ?
00410   int orig_tag_open_idx=0;
00411   bool proprietary_tag=false; //Are we within a proprietary element?
00412   bool proprietary_closing_tag=false;
00413   int crop_end_idx=0;
00414   char buffer[READ_BUFFER_SIZE]="";
00415   char tagname[READ_BUFFER_SIZE]="";
00416   int tagname_idx=0;
00417   char close_tagname[READ_BUFFER_SIZE]="";
00418  
00419   for(i=0;i<READ_BUFFER_SIZE;i++){
00420     buffer[i]=0;
00421     tagname[i]=0;
00422     close_tagname[i]=0;
00423   }
00424   
00425   input_string_size=input_string.size();
00426   
00427   for(i=0;i<=input_string_size;i++){
00428     if(input_string.c_str()[i]=='<'){
00429       tag_open=true;
00430       tag_open_idx=i;
00431       if(proprietary_tag==true&&input_string.c_str()[i+1]=='/'){
00432         //We are now in a closing tag
00433         closing_tag_open=true;
00434         //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl;
00435         if(strncmp(tagname,&(input_string.c_str()[i+2]),strlen(tagname))!=0){
00436           //If it is the begining of an other tag
00437           //cout<<"DIFFERENT!"<<endl;
00438           crop_end_idx=i-1;
00439           strip=true;
00440         }
00441         else{
00442           //Otherwise, it is the start of the closing tag of the proprietary tag
00443           proprietary_closing_tag=true;
00444         }
00445       }
00446       else if(proprietary_tag==true){
00447         //It is the start of a new tag, following a proprietary tag
00448         crop_end_idx=i-1;
00449         strip=true;
00450       }
00451     }
00452     else if(input_string.c_str()[i]=='>'){
00453       tag_open=false;
00454       closing_tag_open=false;
00455       tagname[tagname_idx]=0;
00456       tagname_idx=0;
00457       if(proprietary_closing_tag==true){
00458         crop_end_idx=i;
00459         strip=true;
00460       }
00461     }
00462     else if(tag_open==true&&closing_tag_open==false){
00463       if(input_string.c_str()[i]=='.'){
00464         if(proprietary_tag!=true){
00465           orig_tag_open_idx = tag_open_idx;
00466           proprietary_tag=true;
00467         }
00468       }
00469       tagname[tagname_idx]=input_string.c_str()[i];
00470       tagname_idx++;
00471     }
00472     //cerr <<i<<endl;
00473     if(strip==true)
00474       {
00475         input_string.copy(buffer,(crop_end_idx-orig_tag_open_idx)+1,orig_tag_open_idx);
00476         message_out(INFO,"sanitize_proprietary_tags() (end tag or new tag) removed: "+string(buffer));
00477         input_string.erase(orig_tag_open_idx,(crop_end_idx-orig_tag_open_idx)+1);
00478         i=orig_tag_open_idx-1;
00479         proprietary_tag=false;
00480         proprietary_closing_tag=false;
00481         closing_tag_open=false;
00482         tag_open=false;
00483         strip=false;
00484       }
00485 
00486   }//end for
00487   if(proprietary_tag==true){
00488     if(crop_end_idx==0){//no closing tag
00489       crop_end_idx=input_string.size()-1;
00490     }
00491     input_string.copy(buffer,(crop_end_idx-orig_tag_open_idx)+1,orig_tag_open_idx);
00492     message_out(INFO,"sanitize_proprietary_tags() (end of line) removed: "+string(buffer));
00493     input_string.erase(orig_tag_open_idx,(crop_end_idx-orig_tag_open_idx)+1);
00494   }
00495   return input_string;
00496 }
00497 
00498 
00499 
00505 string find_dtd(string dtd_filename)
00506 {
00507   int i;
00508   ifstream dtd_file;
00509   string dtd_path_filename;
00510   bool dtd_found=false;
00511 
00512   for(i=0;i<DTD_SEARCH_PATH_NUM&&dtd_found==false;i++){
00513     dtd_path_filename=DTD_SEARCH_PATH[i];
00514     dtd_path_filename.append(dtd_filename);
00515     dtd_file.clear();
00516     dtd_file.open(dtd_path_filename.c_str());
00517     if(!dtd_file){
00518       message_out(DEBUG,"find_dtd():Unable to open the file "+dtd_path_filename);
00519     }
00520     else{
00521       message_out(STATUS,"find_dtd():DTD found: "+dtd_path_filename);
00522       dtd_file.close();
00523       dtd_found=true;
00524     }
00525   }
00526   if(dtd_found==false){
00527     message_out(ERROR,"find_dtd():Unable to find the DTD named " + dtd_filename);
00528     dtd_path_filename="";
00529   }
00530   return dtd_path_filename;
00531 }
00532 
00533 

Generated on Mon Nov 19 20:27:59 2007 for LibOFX by  doxygen 1.5.3