栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 面试经验 > 面试问答

如何使用 PHP 从 Office 文件(.doc、.docx、.xlsx、.pptx)中提取文本

面试问答 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

如何使用 PHP 从 Office 文件(.doc、.docx、.xlsx、.pptx)中提取文本

/**
 * Extracts plain text from Office documents (.doc, .docx, .xlsx, .pptx).
 *
 * Usage: construct with a file path, then call convertToText(). The file
 * type is selected from the (case-insensitive) file extension.
 *
 * The OOXML formats (.docx/.xlsx/.pptx) are ZIP containers holding XML
 * parts and are read through ZipArchive; legacy .doc files are scanned as
 * raw bytes with a best-effort heuristic.
 */
class DocxConversion
{
    /** @var string Path of the document to convert. */
    private $filename;

    /**
     * @param string $filePath Path of the document to convert.
     */
    public function __construct($filePath)
    {
        $this->filename = $filePath;
    }

    /**
     * Best-effort text extraction from a legacy binary .doc file.
     *
     * The raw bytes are split on carriage returns; any chunk containing a
     * NUL byte is treated as binary structure and dropped, the remaining
     * chunks are joined with a single space and filtered down to a small
     * ASCII whitelist.
     *
     * @return string|false Extracted text, or false when the file cannot be read.
     */
    private function read_doc()
    {
        // file_get_contents() opens, reads and closes in one step, and unlike
        // fread($h, 0) it does not fail on an empty file.
        $raw = @file_get_contents($this->filename);
        if ($raw === false) {
            return false;
        }

        $outtext = "";
        foreach (explode(chr(0x0D), $raw) as $chunk) {
            if ($chunk === "" || strpos($chunk, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $chunk . " ";
        }

        // Keep letters, digits, whitespace and a few punctuation characters.
        // The escapes matter: an unescaped "/" would terminate the pattern.
        return preg_replace('/[^a-zA-Z0-9\s,.\-\n\r\t@\/_()]/', "", $outtext);
    }

    /**
     * Extracts the text of a .docx file from its word/document.xml part.
     *
     * Table-cell boundaries become a space and paragraph ends become CRLF
     * before all remaining markup is stripped.
     *
     * @return string|false Extracted text ("" when the part is missing),
     *                      or false when the archive cannot be opened.
     */
    private function read_docx()
    {
        // ZipArchive replaces the procedural zip_* API (deprecated since PHP 8.0).
        $zip = new ZipArchive();
        if ($zip->open($this->filename) !== true) {
            return false;
        }

        $content = $zip->getFromName("word/document.xml");
        $zip->close();

        if ($content === false) {
            return "";
        }

        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);

        return strip_tags($content);
    }

    /**
     * Parses an XML string and returns its serialisation with all tags removed.
     *
     * Entity substitution and XInclude are deliberately NOT enabled and
     * network access is forbidden (LIBXML_NONET), because the XML comes
     * from a document that may be untrusted.
     *
     * @param string|false $xml XML source (false/empty yields "").
     * @return string Tag-stripped text, or "" when the XML cannot be parsed.
     */
    private function xml_to_text($xml)
    {
        if ($xml === false || $xml === "") {
            return "";
        }

        // loadXML() is an instance method; calling it statically is a fatal
        // error on PHP 8.
        $dom = new DOMDocument();
        if (!@$dom->loadXML($xml, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return "";
        }

        return strip_tags($dom->saveXML());
    }

    /**
     * Extracts the shared-strings table of an .xlsx workbook as text.
     *
     * @param string|null $input_file Workbook path; defaults to the path
     *                                given to the constructor.
     * @return string Extracted text, or "" when the archive or the
     *                xl/sharedStrings.xml part cannot be read.
     */
    public function xlsx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $zip_handle = new ZipArchive();
        if ($zip_handle->open($input_file) !== true) {
            return "";
        }

        $output_text = "";
        $xml_index = $zip_handle->locateName("xl/sharedStrings.xml");
        if ($xml_index !== false) {
            $output_text = $this->xml_to_text($zip_handle->getFromIndex($xml_index));
        }
        $zip_handle->close();

        return $output_text;
    }

    /**
     * Extracts the text of every slide of a .pptx presentation.
     *
     * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... and the
     * scan stops at the first missing number.
     *
     * @param string|null $input_file Presentation path; defaults to the
     *                                path given to the constructor.
     * @return string Concatenated slide text, or "" when the archive
     *                cannot be opened or contains no slides.
     */
    public function pptx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $zip_handle = new ZipArchive();
        if ($zip_handle->open($input_file) !== true) {
            return "";
        }

        $output_text = "";
        $slide_number = 1;
        while (($xml_index = $zip_handle->locateName("ppt/slides/slide" . $slide_number . ".xml")) !== false) {
            $output_text .= $this->xml_to_text($zip_handle->getFromIndex($xml_index));
            $slide_number++;
        }
        $zip_handle->close();

        return $output_text;
    }

    /**
     * Converts the file given to the constructor to plain text.
     *
     * @return string|false Extracted text; the literal "File Not exists"
     *                      when the path is not a readable file; the literal
     *                      "Invalid File Type" for unsupported extensions;
     *                      false when a .doc/.docx file cannot be opened.
     */
    public function convertToText()
    {
        if (!isset($this->filename) || !is_file($this->filename)) {
            return "File Not exists";
        }

        // PATHINFO_EXTENSION yields "" (not an undefined index) when the
        // path has no extension; lower-casing accepts e.g. "REPORT.DOCX".
        $file_ext = strtolower(pathinfo($this->filename, PATHINFO_EXTENSION));

        switch ($file_ext) {
            case "doc":
                return $this->read_doc();
            case "docx":
                return $this->read_docx();
            case "xlsx":
                return $this->xlsx_to_text($this->filename);
            case "pptx":
                return $this->pptx_to_text($this->filename);
            default:
                return "Invalid File Type";
        }
    }
}

.doc 文件是二进制 blob,可以使用 fopen 读取。而 .docx 文件只是一个 zip 容器,其中包含若干 XML 文件(来源:Wikipedia),可以使用 zip_open 读取(该过程式 API 自 PHP 8.0 起已被弃用,建议改用 ZipArchive)。

以上类的用法

// Usage example: pick ONE of the constructors below, then print the text.
// Each statement sits on its own line so the "//" alternatives comment out
// only themselves and the final echo actually runs.
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText = $docObj->convertToText();


转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/435284.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号