文库关键技术之PDF转换为HTML

Aspose.PDF的使用方法,官网上都有介绍,但示例都比较简单,主要起演示作用。

结合自己实际工作,我对相关方法进行了整理,自认为比较关键的地方,会作简要说明。

 

在调用转换方法(例如 Pdf2Html_NoSplit)生成html之前,必须先给 filepath(PDF源文件完整路径)和 folder(输出目录)赋值,切记切记。

 

/// <summary>
/// Aspose.PDF helpers: converts a PDF into a single HTML page, renders a
/// first-page thumbnail, and extracts the plain text of a PDF.
/// <see cref="filepath"/> and <see cref="folder"/> must be set before
/// calling <see cref="Pdf2Html_NoSplit"/>.
/// </summary>
class PdfTools
{
    /// <summary>
    /// Full path of the source PDF file.
    /// </summary>
    public string filepath { get; set; }

    /// <summary>
    /// Name of the source file without its directory, truncated at the first
    /// dot. Used as the base name of the generated .html and .css files.
    /// Returns the whole file name when it contains no dot, and an empty
    /// string when <see cref="filepath"/> has no file name part.
    /// </summary>
    private string filename
    {
        get
        {
            string name = Path.GetFileName(filepath);
            if (string.IsNullOrEmpty(name))
            {
                return string.Empty;
            }

            // Guard against names without any dot; Substring(0, -1) would throw.
            int dot = name.IndexOf('.');
            return dot < 0 ? name : name.Substring(0, dot);
        }
    }

    /// <summary>
    /// Output directory; every generated file is written directly into it.
    /// </summary>
    public string folder { get; set; }

    /// <summary>
    /// Creates an instance whose <see cref="filepath"/> and
    /// <see cref="folder"/> must be assigned before converting.
    /// </summary>
    public PdfTools() { }

    /// <summary>
    /// Creates an instance for the given source file and output directory.
    /// </summary>
    /// <param name="_filepath">Full path of the source PDF file.</param>
    /// <param name="_folder">Output directory; normalized to backslashes with a trailing backslash.</param>
    public PdfTools(string _filepath, string _folder)
    {
        filepath = _filepath;
        folder = NormalizeFolder(_folder);
    }

    /// <summary>
    /// Converts the PDF at <see cref="filepath"/> into one complete HTML file
    /// named after the source file and stored in <see cref="folder"/>.
    /// WARNING: if <see cref="folder"/> already exists it is deleted
    /// recursively and recreated before the conversion starts.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the HTML was saved; <c>false</c> when the source file
    /// does not exist, no output folder is set, or saving failed (in which
    /// case the output folder is removed again on a best-effort basis).
    /// </returns>
    public bool Pdf2Html_NoSplit()
    {
        // The source PDF must exist.
        if (string.IsNullOrEmpty(filepath) || !File.Exists(filepath))
        {
            return false;
        }

        // An empty folder would normalize to "\" and the recursive delete
        // below would then target the drive root, so refuse it up front.
        if (folder == null || folder.Trim().Length == 0)
        {
            return false;
        }

        folder = NormalizeFolder(folder);

        if (Directory.Exists(folder))
        {
            Directory.Delete(folder, true);
        }
        Directory.CreateDirectory(folder);

        using (Document doc = new Document(filepath))
        {
            HtmlSaveOptions hso = new HtmlSaveOptions();
            hso.SplitIntoPages = false;
            hso.FixedLayout = true;
            hso.CompressSvgGraphicsIfAny = true;

            // All generated resources are kept in one folder, so the three
            // saving / URL callbacks below are overridden.
            hso.CustomResourceSavingStrategy = new HtmlSaveOptions.ResourceSavingStrategy(ResourceStrategy);
            hso.CustomCssSavingStrategy = new HtmlSaveOptions.CssSavingStrategy(CssStrategy);
            hso.CustomStrategyOfCssUrlCreation = new HtmlSaveOptions.CssUrlMakingStrategy(CssUrlStrategy);

            hso.FontSavingMode = HtmlSaveOptions.FontSavingModes.AlwaysSaveAsTTF;

            string outpath = folder + filename + ".html";
            try
            {
                doc.Save(outpath, hso);
                return true;
            }
            catch (Exception)
            {
                // Failure is reported through the return value; remove the
                // partially written output without masking that result.
                TryDeleteFolder(folder);
                return false;
            }
        }
    }

    /// <summary>
    /// Renders the first page of a PDF file as a PNG thumbnail.
    /// </summary>
    /// <param name="_input">Full path of the source PDF file.</param>
    /// <param name="_output">Path the image is saved to; an existing file is replaced.</param>
    public void CreateThumb(string _input, string _output)
    {
        if (File.Exists(_output))
        {
            File.Delete(_output);
        }

        using (Document doc = new Document(_input))
        using (FileStream imgstream = new FileStream(_output, FileMode.Create))
        {
            // Image display quality.
            Aspose.Pdf.Devices.Resolution resolution = new Aspose.Pdf.Devices.Resolution(100);
            Aspose.Pdf.Devices.PngDevice device = new Aspose.Pdf.Devices.PngDevice(resolution);

            // Page numbering starts at 1; the using block closes the stream.
            device.Process(doc.Pages[1], imgstream);
        }
    }

    /// <summary>
    /// Extracts the text of every page of a PDF file as a single line:
    /// line breaks become spaces and runs of spaces are collapsed to one.
    /// </summary>
    /// <param name="_input">Full path of the source PDF file.</param>
    /// <returns>The extracted text, or <c>null</c> when a page could not be processed.</returns>
    public string GetTxtFromPDF(string _input)
    {
        StringBuilder sb = new StringBuilder();

        using (Document doc = new Document(_input))
        {
            for (int i = 1; i <= doc.Pages.Count; i++)
            {
                Aspose.Pdf.Text.TextAbsorber ab = new Aspose.Pdf.Text.TextAbsorber();
                ab.TextSearchOptions.LimitToPageBounds = true;
                try
                {
                    doc.Pages[i].Accept(ab);
                    sb.Append(ab.Text);
                }
                catch (Exception)
                {
                    return null;
                }
            }
        }

        // NOTE(review): the published snippet had a second Replace whose
        // arguments were lost in the paste (possibly a full-width or
        // non-breaking space); confirm against the original source.
        sb.Replace("\r\n", " ");

        string output = sb.ToString();

        // Collapse runs of spaces. Searching for TWO spaces is what makes the
        // loop terminate; searching for a single space would never finish.
        while (output.IndexOf("  ", StringComparison.Ordinal) >= 0)
        {
            output = output.Replace("  ", " ");
        }
        return output;
    }

    /// <summary>
    /// Resource callback: stores fonts and images in <see cref="folder"/>.
    /// Images get a fresh GUID-based name, other resources keep the supposed
    /// file name. Per the original author, pages may render incorrectly
    /// without this callback.
    /// </summary>
    /// <param name="resource">Resource description supplied during saving.</param>
    /// <returns>File name (relative to the HTML file) under which the resource was stored.</returns>
    private string ResourceStrategy(SaveOptions.ResourceSavingInfo resource)
    {
        string resourcename;
        if (resource.ResourceType == SaveOptions.NodeLevelResourceType.Image)
        {
            resourcename = Guid.NewGuid().ToString() + Path.GetExtension(resource.SupposedFileName);
        }
        else
        {
            resourcename = resource.SupposedFileName;
        }

        if (!Directory.Exists(folder))
        {
            Directory.CreateDirectory(folder);
        }

        string outfile = folder + resourcename;

        // Check the real target path, not the bare name relative to the
        // current working directory.
        if (File.Exists(outfile))
        {
            return resourcename;
        }

        WriteStreamToFile(resource.ContentStream, outfile);
        return resourcename;
    }

    /// <summary>
    /// CSS callback: writes the style sheet referenced by the page to
    /// "&lt;filename&gt;.css" inside <see cref="folder"/>.
    /// </summary>
    /// <param name="resource">CSS description supplied during saving.</param>
    private void CssStrategy(HtmlSaveOptions.CssSavingInfo resource)
    {
        string path = folder + filename + @".css";
        WriteStreamToFile(resource.ContentStream, path);
    }

    /// <summary>
    /// CSS URL callback: rewrites the style sheet address used by the page so
    /// it points at the file written by <see cref="CssStrategy"/>.
    /// </summary>
    /// <param name="resource">URL request description supplied during saving.</param>
    /// <returns>The relative URL "&lt;filename&gt;.css".</returns>
    private string CssUrlStrategy(HtmlSaveOptions.CssUrlRequestInfo resource)
    {
        return filename + ".css";
    }

    /// <summary>
    /// Converts forward slashes to backslashes and guarantees a trailing
    /// backslash. A null path is returned unchanged.
    /// </summary>
    private static string NormalizeFolder(string path)
    {
        if (path == null)
        {
            return null;
        }

        string normalized = path.Replace("/", @"\");
        if (!normalized.EndsWith(@"\", StringComparison.Ordinal))
        {
            normalized = normalized + @"\";
        }
        return normalized;
    }

    /// <summary>
    /// Copies the remaining content of a stream into a newly created file.
    /// The source stream is left open because this class does not own it.
    /// </summary>
    private static void WriteStreamToFile(Stream content, string path)
    {
        using (FileStream target = new FileStream(path, FileMode.Create, FileAccess.Write))
        {
            content.CopyTo(target);
        }
    }

    /// <summary>
    /// Best-effort recursive delete used for cleanup after a failed save;
    /// I/O and permission errors are deliberately ignored.
    /// </summary>
    private static void TryDeleteFolder(string path)
    {
        try
        {
            if (Directory.Exists(path))
            {
                Directory.Delete(path, true);
            }
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }
}

 

文库关键技术之PDF转换为HTML,古老的榕树,5-wow.com

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。