Click here to Skip to main content
15,885,757 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
Hello,

I am creating an XML file and it is successfully being created, but in the XML file, I am getting text like this:

Republican promises that the law will be repealed within months


Here, the block after Re is STX. Please refer to the image in the link below:

https://pasteboard.co/HlzdUOR.png

What I have tried:

private void btn_CreateArticles_Click(object sender, EventArgs e)
{
    this.Cursor = Cursors.WaitCursor;
    if (!string.IsNullOrEmpty(StructureFileName))
    {
        if (System.IO.File.Exists(StructureFileName))
        {
            string UnpackDirectory = "";
            string UnpackFile = "";
            UnpackDirectory = System.IO.Path.GetDirectoryName(StructureFileName);
            UnpackFile = System.IO.Path.GetFileNameWithoutExtension(StructureFileName);
            string EpubFolder = "";
            EpubFolder = UnpackDirectory + "\\" + UnpackFile + "_Epub";
            if (!(System.IO.Directory.Exists(EpubFolder)))
            {
                System.IO.Directory.CreateDirectory(EpubFolder);
            }
            if (!(System.IO.Directory.Exists(EpubFolder + "\\OPS")))
            {
                System.IO.Directory.CreateDirectory(EpubFolder + "\\OPS");
            }
            string StrBooKTitle = "";
            StrBooKTitle = txt_BookTitle.Text;
            if (Regex.IsMatch(StrBooKTitle, "\n" + "|" + "\r", RegexOptions.Multiline))
            {
                StrBooKTitle = Regex.Replace(StrBooKTitle, "\n" + "|" + "\r", "", RegexOptions.Multiline);
            }
            //Use Replacer Function here on StrBookTitle

            if (!string.IsNullOrEmpty(StrBooKTitle.Trim(' ')))
            {
                if (DataGridView1.RowCount > 0)
                {
                    string ArticleHeadStr = "";
                    ArticleHeadStr = ArticleHeadStr + "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<html xmlns:saxon=\"http://saxon.sf.net/\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:nitf=\"http://www.nytimes.com/applicationdata/xml/nitf-3-3.dtd\">" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<head>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<title>" + StrBooKTitle + "</title>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<link rel=\"stylesheet\" href=\"css/TablesAndFloats.css\" type=\"text/css\"/>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "</head>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<body>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<div class=\"clean\"/>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<div id=\"header\" class=\"masthead\">" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<div class=\"masthead-text\">" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<div id=\"header_title\" class=\"masthead-section\">" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "</div>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "</div>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "</div>" + Environment.NewLine;
                    ArticleHeadStr = ArticleHeadStr + "<div>" + Environment.NewLine;
                    string ArticleFootStr = "";
                    ArticleFootStr = Environment.NewLine + "</div>" + Environment.NewLine + "</body>" + Environment.NewLine + "</html>";
                    string TempArticleHeadStr = "";
                    string TempArticleStr = "";
                    string TempArticleFileName = "";
                    int ArticleGreaterFileName = 0;
                    for (var irow = 0; irow < DataGridView1.RowCount; irow++)
                    {
                        if (DataGridView1.Rows[irow].Cells[0].Value != null)
                        {
                            if ((Encoding.Unicode.GetByteCount(TempArticleStr) / 1024.0) > 270)
                            {
                                TempArticleStr = ArticleHeadStr + TempArticleStr + ArticleFootStr;
                                ArticleGreaterFileName = ArticleGreaterFileName + 1;
                                string STempArticleFileName = TempArticleFileName.Replace(".xml", ArticleGreaterFileName + ".xml");
                                TempArticleStr = EssentialTextReplace(TempArticleStr).ToString();
                                Common.WriteFile(STempArticleFileName, TempArticleStr);
                                TempArticleStr = "";
                            }
                            if ((string)DataGridView1.Rows[irow].Cells[2].Value == "article-full-headline")
                            {
                                if (!string.IsNullOrEmpty(TempArticleStr))
                                {
                                    TempArticleStr = TempArticleHeadStr + TempArticleStr + ArticleFootStr;
                                    //EssentialTextReplace
                                    TempArticleStr = EssentialTextReplace(TempArticleStr).ToString();

                                    if (ArticleGreaterFileName == 0)
                                    {
                                        Common.WriteFile(TempArticleFileName, TempArticleStr);
                                    }
                                    else
                                    {
                                        ArticleGreaterFileName = ArticleGreaterFileName + 1;
                                        string STempArticleFileName = TempArticleFileName.Replace(".xml", ArticleGreaterFileName + ".xml");
                                        Common.WriteFile(STempArticleFileName, TempArticleStr);
                                    }
                                    TempArticleStr = "";
                                    ArticleGreaterFileName = 0;
                                }
                                TempArticleFileName = EpubFolder + "\\OPS\\article_" + Convert.ToString(DataGridView1.Rows[irow].Cells[0].Value).Trim(' ') + "-" + Convert.ToString(DataGridView1.Rows[irow].Cells[1].Value).Trim(' ') + ".xml";
                                TempArticleHeadStr = ArticleHeadStr;
                                if ((string)DataGridView1.Rows[irow].Cells[4].Value != "")
                                {
                                    TempArticleHeadStr = TempArticleHeadStr.Replace("@@@", Convert.ToString(DataGridView1.Rows[irow].Cells[4].Value).Trim(' '));
                                }
                                if ((string)DataGridView1.Rows[irow].Cells[3].Value != "")
                                {
                                    if (string.IsNullOrEmpty(TempArticleStr))
                                    {
                                        TempArticleStr = TempArticleStr + Convert.ToString(DataGridView1.Rows[irow].Cells[3].Value).Trim(' ');
                                    }
                                    else
                                    {
                                        TempArticleStr = TempArticleStr + Environment.NewLine + Convert.ToString(DataGridView1.Rows[irow].Cells[3].Value).Trim(' ');
                                    }
                                }
                            }
                            else if ((string)DataGridView1.Rows[irow].Cells[2].Value == "sectionName")
                            {
                                continue;
                            }
                            else if ((string)DataGridView1.Rows[irow].Cells[2].Value == "articleImageCaption")
                            {
                                var test = true;
                                if ((string)DataGridView1.Rows[irow].Cells[3].Value != "")
                                {
                                    string Tstr = Convert.ToString(DataGridView1.Rows[irow].Cells[3].Value).Trim(' ');
                                    string tsearchStr = "@" + DataGridView1.Rows[irow].Cells[0].Value + "_" + DataGridView1.Rows[irow].Cells[10].Value + "_caption";
                                    if (TempArticleStr.Contains(tsearchStr))
                                    {
                                        TempArticleStr = TempArticleStr.Replace(tsearchStr, Tstr);
                                    }
                                }
                            }
                            else
                            {
                                if ((string)DataGridView1.Rows[irow].Cells[3].Value != "")
                                {
                                    if (string.IsNullOrEmpty(TempArticleStr))
                                    {
                                        TempArticleStr = TempArticleStr + Convert.ToString(DataGridView1.Rows[irow].Cells[3].Value).Trim(' ');
                                    }
                                    else
                                    {
                                        TempArticleStr = TempArticleStr + Environment.NewLine + Convert.ToString(DataGridView1.Rows[irow].Cells[3].Value).Trim(' ');
                                    }
                                }
                            }
                        }
                    }
                    if (!string.IsNullOrEmpty(TempArticleStr))
                    {
                        TempArticleStr = TempArticleHeadStr + TempArticleStr + ArticleFootStr;
                        TempArticleStr = EssentialTextReplace(TempArticleStr).ToString();

                        if (ArticleGreaterFileName == 0)
                        {
                            Common.WriteFile(TempArticleFileName, TempArticleStr);
                        }
                        else
                        {
                            ArticleGreaterFileName = ArticleGreaterFileName + 1;
                            string STempArticleFileName = TempArticleFileName.Replace(".xml", ArticleGreaterFileName + ".xml");
                            Common.WriteFile(STempArticleFileName, TempArticleStr);
                        }
                        TempArticleStr = "";
                        ArticleGreaterFileName = 0;
                    }
                }
                btn_CreateArticles.ForeColor = System.Drawing.Color.Red;
                MessageBox.Show("Completed Articles", "Epub Articles", MessageBoxButtons.OK, MessageBoxIcon.Information);
            }
            else
            {
                MessageBox.Show("Book Title Blank", "Error Epub Articles", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
        else
        {
            MessageBox.Show("Structure File Path Not Correct", "Error Epub Articles", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
    }
    else
    {
        MessageBox.Show("Structure File Not Loaded", "Error Epub Articles", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
    this.Cursor = Cursors.Default;
}
Posted
Updated 16-May-18 22:29pm

1 solution

STX is a control character[^]. You can remove those from a string like so:

var stringWithoutControlCharacters = new string(originalString.Where(c => !char.IsControl(c)).ToArray());
 
Share this answer
 
Comments
Primo Chalice 17-May-18 4:57am    
Now this is the code to convert the xml to pdf and doc:

Document mydoc = new Document();
mydoc.LoadFromFile(@"C:/Users/amanc/Desktop/SAMPLE_1-2_Epub/OPS/article_1-1.xml", FileFormat.Xml);
mydoc.SaveToFile("article_1-1.doc", FileFormat.Doc);
mydoc.SaveToFile("article_1-1.pdf", FileFormat.PDF);

and I am getting an error here:
mydoc.LoadFromFile(@"C:/Users/amanc/Desktop/SAMPLE_1-2_Epub/OPS/article_1-1.xml", FileFormat.Xml);

The error is:
System.Xml.XmlException: ''', hexadecimal value 0x02, is an invalid character. Line 23, position 244.'. Again STX.
[no name] 17-May-18 5:00am    
0x02 is the hex representation of the control character STX. Control characters are not allowed in XML; you should filter them out during the creation of the XML.
Primo Chalice 17-May-18 5:01am    
I actually did implement it. Using the code that you suggested.

I might have done something wrong in its placement. Can you please tell me where in the code shall I insert it? It would be really helpful.
[no name] 17-May-18 5:13am    
Before you write the file with Common.WriteFile, you do a call to EssentialTextReplace(). The code for this is missing, but I presume that is intended for cleaning up the XML? That's where I would expect the replacement. Could you post this method maybe?
Primo Chalice 17-May-18 5:28am    
private object EssentialTextReplace(string TextStr)
{
string NewTextStr = "";
NewTextStr = TextStr;
if (Regex.IsMatch(NewTextStr, "@\\d+_\\d+_caption", RegexOptions.Multiline))
{
var test = true;
NewTextStr = Regex.Replace(NewTextStr, "@\\d+_\\d+_caption", "", RegexOptions.Multiline);
}
if (Regex.IsMatch(NewTextStr, "\\t", RegexOptions.Multiline))
{
NewTextStr = Regex.Replace(NewTextStr, "\\t", "", RegexOptions.Multiline);
}
if (NewTextStr.Contains("\\r\\n"))
{
NewTextStr = NewTextStr.Replace("\\r\\n", Environment.NewLine);
}
if (NewTextStr.Contains("<p"))
{
NewTextStr = NewTextStr.Replace("<p", "" + Environment.NewLine + "<p");
}

if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_bold\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_italic\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_subscript\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_superscript\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_underline\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
if (NewTextStr.Contains(""))
{
NewTextStr = NewTextStr.Replace("", "<span class=\"ld_strikthrough\">");
}
if (NewTextStr.Contains("
"))
{
NewTextStr = NewTextStr.Replace("
", "");
}
//Use Replacer Function here on NewTextStr
return NewTextStr;
}

This is the EssentialTextReplace() function.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900