cleanup and refactoring; renamed TextRun to TagPair

sfperusacdev · Feb 2, 2021 · fddd06c · fddd06c
1 parent 0566f78
commit fddd06c
Show file tree

Hide file tree

Showing 8 changed files with 155 additions and 131 deletions.
diff --git a/Makefile b/Makefile
@@ -2,3 +2,6 @@
 test:
  @go test -v .
 
+gofmt:
+ @gofmt -w *.go
+
diff --git a/README.md b/README.md
@@ -103,12 +103,11 @@ To not cause too much confusion, here is a list of terms which you might come ac
 
 * **Parser**: Every file which this lib handles (document, footers and headers) has their own parser attached since everything is relative to the underlying byte-slice (aka. file).
 * **Position**: A Position is just a `Start` and `End` offset, relative to the byte slice of the document of a parser.
-* **Run**: Describes the pair `<w:r>` and `</w:r>` and thus has two `Positions` for the open and close tag. Since they are Positions, they have a `Start` and `End` Position which point to `<` and `>` of the tag. A run also consists of a `TextRun`.
-* **TextRun**: Is always nested inside a run and describes the tags `<w:t>` and `</w:t>`. It also just consists of two `Positions`. The type was just created for clarity and does not have special functionality.
+* **Run**: Describes the pair `<w:r>` and `</w:r>` and thus has two `Positions` for the open and close tag. Since they are Positions, they have a `Start` and `End` offset which point to `<` and `>` of the tag. A run may also contain a `TagPair`, which holds the two `Positions` of the `<w:t>` and `</w:t>` tags nested inside the run.
 
 * **Placeholder**: A Placeholder is basically just a list of `PlaceholderFragments` representing a full placeholder extracted by a `Parser`.
 * **PlaceholderFragment**: A PlaceholderFragment is a parsed fragment of a placeholder since those will most likely be ripped apart by WordprocessingML. The Placeholder `{foo-bar-baz}` might ultimately consist of 5 fragments ( `{`, `foo-`, `bar-`, `baz`, `}`).
-The fragment is at the heart of replacing. It knows to which `Run` it belongs to and has methods of manipulating these byte-offsets. Additionally it has a `Position` which describes the offset inside the `TextRun` since the fragments don't always start at the beginning of one (e.g. `<w:t>some text {fragment-start</w:t>`)
+The fragment is at the heart of replacing. It knows which `Run` it belongs to and has methods for manipulating these byte-offsets. Additionally, it has a `Position` which describes the offset inside the `TagPair`, since fragments don't always start at the beginning of one (e.g. `<w:t>some text {fragment-start</w:t>`).
 
 ### ➤ How it works
 This section will give you a short overview of what's actually going on.

diff --git a/document.go b/document.go
@@ -83,7 +83,7 @@ func OpenBytes(b []byte) (*Document, error) {
 // newDocument will create a new document struct given the zipFile.
 // The params 'path' and 'docxFile' may be empty/nil in case the document is created from a byte source directly.
 //
-// newDocument will parse the docx archive and ValidateRuns that at least a 'document.xml' exists.
+// newDocument will parse the docx archive and validate that at least a 'document.xml' exists.
 // If 'word/document.xml' is missing, an error is returned since the docx cannot be correct.
 // Then all files are parsed for their runs before returning the new document.
 func newDocument(zipFile *zip.Reader, path string, docxFile *os.File) (*Document, error) {
@@ -416,10 +416,6 @@ func (fm FileMap) Write(writer io.Writer, filename string) error {
  return fmt.Errorf("file not found %s", filename)
  }
 
- // cleanup the file in order to solve known compatibility issues
- // MS Word will not open the document if a file contains a singleton text <w:t/>, thus they need to be removed
- //file = []byte(strings.Replace(string(file), "<w:t/>", "", -1))
-
  _, err := writer.Write(file)
  if err != nil && err != io.EOF {
  return fmt.Errorf("unable to writeFile '%s': %s", filename, err)

diff --git a/parse.go b/parse.go
@@ -24,14 +24,14 @@ var (
  RunCloseTagRegex = regexp.MustCompile(`(</w:r>)`)
  // RunSingletonTagRegex matches a singleton run tag
  RunSingletonTagRegex = regexp.MustCompile(`(<w:r/>)`)
- // TextRunOpenTagRegex matches all OpenTags for text-runs, including eventually set attributes
- TextRunOpenTagRegex = regexp.MustCompile(`(<w:t).*>`)
- // TextRunCloseTagRegex matches the close tag of text-runs
- TextRunCloseTagRegex = regexp.MustCompile(`(</w:t>)`)
- // ErrParsingFailed is returned if the parsing failed and the result cannot be used.
+ // TextOpenTagRegex matches all open tags of text-runs, including any attributes that may be set
+ TextOpenTagRegex = regexp.MustCompile(`(<w:t).*>`)
+ // TextCloseTagRegex matches the close tag of text-runs
+ TextCloseTagRegex = regexp.MustCompile(`(</w:t>)`)
+ // ErrTagsInvalid is returned if the parsing failed and the result cannot be used.
  // Typically this means that one or more tag-offsets were not parsed correctly which
  // would cause the document to become corrupted as soon as replacing starts.
- ErrParsingFailed = errors.New("failed to parse the document, cannot continue")
+ ErrTagsInvalid = errors.New("one or more tags are invalid and will cause the XML to be corrupt")
 )
 
 // RunParser can parse a list of Runs from a given byte slice.
@@ -63,53 +63,14 @@ func (parser *RunParser) Execute() error {
  return err
  }
 
- return ValidateRuns(parser.doc, parser.runs)
+ return ValidatePositions(parser.doc, parser.runs)
 }
 
 // Runs returns the all runs found by the parser.
 func (parser *RunParser) Runs() DocumentRuns {
  return parser.runs
 }
 
-// ValidateRuns will iterate over all runs and their texts (if any) and ensure that they match
-// their respective regex.
-// If the validation failed, the replacement will not work since offsets are wrong.
-func ValidateRuns(document []byte, runs []*Run) error {
- parsingFailed := false
- for _, run := range runs {
-
- // singleton tags must not be validated
- if RunSingletonTagRegex.MatchString(string(document[run.OpenTag.Start:run.OpenTag.End])) {
- continue
- }
-
- if !RunOpenTagRegex.MatchString(string(document[run.OpenTag.Start:run.OpenTag.End])) {
- log.Println("RunOpenTagRegex failed to match", run.String(document))
- parsingFailed = true
- }
- if !RunCloseTagRegex.MatchString(string(document[run.CloseTag.Start:run.CloseTag.End])) {
- log.Println("RunCloseTagRegex failed to match", run.String(document))
- parsingFailed = true
- }
-
- if run.HasText {
- if !TextRunOpenTagRegex.MatchString(string(document[run.Text.StartTag.Start:run.Text.StartTag.End])) {
- log.Println("TextRunOpenTagRegex failed to match", run.String(document))
- parsingFailed = true
- }
- if !TextRunCloseTagRegex.MatchString(string(document[run.Text.EndTag.Start:run.Text.EndTag.End])) {
- log.Println("TextRunCloseTagRegex failed to match", run.String(document))
- parsingFailed = true
- }
- }
- }
- if parsingFailed {
- return ErrParsingFailed
- }
-
- return nil
-}
-
 // findRuns will search through the document and collect all runs found.
 // The text tags are not analyzed at this point, that's the next step.
 func (parser *RunParser) findRuns() error {
@@ -120,8 +81,8 @@ func (parser *RunParser) findRuns() error {
  tmpRun := NewEmptyRun()
  singleton := false
 
- // nestCount holds the nesting-level. It is going to be incremented on every StartTag and decremented
- // on every EndTag.
+ // nestCount holds the nesting-level. It is going to be incremented on every OpenTag and decremented
+ // on every CloseTag.
  nestCount := 0
 
  // popRun will pop the last Run from the runStack if there is any on the stack
@@ -132,7 +93,7 @@ func (parser *RunParser) findRuns() error {
  }
 
  // nextIteration resets the temporary values used inside the for-loop to be ready for the next iteration
- // This is used after a run has been fully analyzed (StartTag and EndTag were found).
+ // This is used after a run has been fully analyzed (OpenTag and CloseTag were found).
  // As long as there are runs on the runStack, they will be popped from it.
  // Only when the stack is empty, a new empty Run struct is created.
  nextIteration := func() {
@@ -186,7 +147,7 @@ func (parser *RunParser) findRuns() error {
  if elem.Name.Local == RunElementName {
 
  // if the run is a singleton tag, it was already identified by the xml.StartElement case
- // in that case, the EndTag is the same as the openTag and no further work needs to be done
+ // in that case, the CloseTag is the same as the openTag and no further work needs to be done
  if singleton {
  tmpRun.CloseTag = tmpRun.OpenTag
  parser.runs = append(parser.runs, tmpRun) // run is finished
@@ -213,7 +174,7 @@ func (parser *RunParser) findRuns() error {
 
  if nestCount != 0 {
  log.Printf("invalid nestCount, should be 0 but is %d\n", nestCount)
- return ErrParsingFailed
+ return ErrTagsInvalid
  }
 
  return nil
@@ -257,7 +218,7 @@ func (parser *RunParser) findTextRuns() error {
  return fmt.Errorf("unable to find currentRun for text start-element")
  }
  currentRun.HasText = true
- currentRun.Text.StartTag = Position{
+ currentRun.Text.OpenTag = Position{
  Start: tagStartPos,
  End: tagEndPos,
  }
@@ -275,7 +236,7 @@ func (parser *RunParser) findTextRuns() error {
  if currentRun == nil {
  return fmt.Errorf("unable to find currentRun for text end-element")
  }
- currentRun.Text.EndTag = Position{
+ currentRun.Text.CloseTag = Position{
  Start: tagStartPos,
  End: tagEndPos,
  }
@@ -297,22 +258,56 @@ func (parser *RunParser) findOpenBracketPos(endBracketPos int64) int64 {
  return 0
 }
 
-// TagPosition returns a filled Position struct given the end position and the tag itself.
-func TagPosition(endPos int64, tag string) (tp Position) {
- tp.End = endPos
- tp.Start = endPos - int64(len(tag))
- return tp
-}
+// ValidatePositions will iterate over all runs and their texts (if any) and ensure that they match
+// their respective regex.
+// If the validation failed, the replacement will not work since offsets are wrong.
+func ValidatePositions(document []byte, runs []*Run) error {
+ parsingFailed := false
+ for _, run := range runs {
+
+ // singleton tags do not need to be validated and are skipped
+ if run.OpenTag.Match(RunSingletonTagRegex, document) {
+ continue
+ }
+
+ if !run.OpenTag.Match(RunOpenTagRegex, document) {
+ log.Println("RunOpenTagRegex failed to match", run.String(document))
+ parsingFailed = true
+ }
+ if !run.CloseTag.Match(RunCloseTagRegex, document) {
+ log.Println("RunCloseTagRegex failed to match", run.String(document))
+ parsingFailed = true
+ }
 
-// TextRun defines the <w:t> element which contains the actual literal text data.
-// A TextRun is always a child of a Execute.
-type TextRun struct {
- StartTag Position
- EndTag Position
+ if run.HasText {
+ if !run.Text.OpenTag.Match(TextOpenTagRegex, document) {
+ log.Println("TextOpenTagRegex failed to match", run.String(document))
+ parsingFailed = true
+ }
+ if !run.Text.CloseTag.Match(TextCloseTagRegex, document) {
+ log.Println("TextCloseTagRegex failed to match", run.String(document))
+ parsingFailed = true
+ }
+ }
+ }
+ if parsingFailed {
+ return ErrTagsInvalid
+ }
+
+ return nil
 }
 
 // Position is a generic position of a tag, represented by byte offsets
 type Position struct {
  Start int64
  End int64
 }
+
+// Match applies the given regex to the bytes of data between p.Start and p.End and reports
+// whether that part of the data matches.
+func (p Position) Match(regexp *regexp.Regexp, data []byte) bool {
+ if !regexp.MatchString(string(data[p.Start:p.End])) {
+ return false
+ }
+ return true
+}
diff --git a/placeholder.go b/placeholder.go
@@ -34,7 +34,7 @@ type Placeholder struct {
 func (p Placeholder) Text(docBytes []byte) string {
  str := ""
  for _, fragment := range p.Fragments {
- s := fragment.Run.Text.StartTag.End
+ s := fragment.Run.Text.OpenTag.End
  t := docBytes[s+fragment.Position.Start : s+fragment.Position.End]
  str += string(t)
  }
@@ -43,13 +43,13 @@ func (p Placeholder) Text(docBytes []byte) string {
 
 // StartPos returns the absolute start position of the placeholder.
 func (p Placeholder) StartPos() int64 {
- return p.Fragments[0].Run.Text.StartTag.End + p.Fragments[0].Position.Start
+ return p.Fragments[0].Run.Text.OpenTag.End + p.Fragments[0].Position.Start
 }
 
 // EndPos returns the absolute end position of the placeholder.
 func (p Placeholder) EndPos() int64 {
  end := len(p.Fragments) - 1
- return p.Fragments[end].Run.Text.StartTag.End + p.Fragments[end].Position.End
+ return p.Fragments[end].Run.Text.OpenTag.End + p.Fragments[end].Position.End
 }
 
 // ParsePlaceholders will, given the document run positions and the bytes, parse out all placeholders including

diff --git a/placeholder_fragment.go b/placeholder_fragment.go
@@ -38,10 +38,10 @@ func (p *PlaceholderFragment) ShiftAll(deltaLength int64) {
  p.Run.OpenTag.End += deltaLength
  p.Run.CloseTag.Start += deltaLength
  p.Run.CloseTag.End += deltaLength
- p.Run.Text.StartTag.Start += deltaLength
- p.Run.Text.StartTag.End += deltaLength
- p.Run.Text.EndTag.Start += deltaLength
- p.Run.Text.EndTag.End += deltaLength
+ p.Run.Text.OpenTag.Start += deltaLength
+ p.Run.Text.OpenTag.End += deltaLength
+ p.Run.Text.CloseTag.Start += deltaLength
+ p.Run.Text.CloseTag.End += deltaLength
 }
 
 // ShiftCut will shift the fragment position markers in such a way that the fragment can be considered empty.
@@ -52,8 +52,8 @@ func (p *PlaceholderFragment) ShiftAll(deltaLength int64) {
 // If that data was removed from the document, the positions (not all positions) of the fragment need to be adjusted.
 // The text positions are set equal (start == end).
 func (p *PlaceholderFragment) ShiftCut(cutLength int64) {
- p.Run.Text.EndTag.Start -= cutLength
- p.Run.Text.EndTag.End -= cutLength
+ p.Run.Text.CloseTag.Start -= cutLength
+ p.Run.Text.CloseTag.End -= cutLength
  p.Run.CloseTag.Start -= cutLength
  p.Run.CloseTag.End -= cutLength
  p.Position.End = p.Position.Start
@@ -64,21 +64,21 @@ func (p *PlaceholderFragment) ShiftCut(cutLength int64) {
 // For example, the fragment text was 'placeholder' (11 bytes) which is replaced with 'a-super-awesome-value' (21 bytes)
 // In that case the deltaLength would be 10. In order to accommodate for the change in bytes you'd need to call ShiftReplace(10)
 func (p *PlaceholderFragment) ShiftReplace(deltaLength int64) {
- p.Run.Text.EndTag.Start += deltaLength
- p.Run.Text.EndTag.End += deltaLength
+ p.Run.Text.CloseTag.Start += deltaLength
+ p.Run.Text.CloseTag.End += deltaLength
  p.Run.CloseTag.Start += deltaLength
  p.Run.CloseTag.End += deltaLength
  p.Position.End += deltaLength
 }
 
 // StartPos returns the absolute start position of the fragment.
 func (p PlaceholderFragment) StartPos() int64 {
- return p.Run.Text.StartTag.End + p.Position.Start
+ return p.Run.Text.OpenTag.End + p.Position.Start
 }
 
 // EndPos returns the absolute end position of the fragment.
 func (p PlaceholderFragment) EndPos() int64 {
- return p.Run.Text.StartTag.End + p.Position.End
+ return p.Run.Text.OpenTag.End + p.Position.End
 }
 
 // Text returns the actual text of the fragment given the source bytes.
@@ -100,7 +100,7 @@ func (p PlaceholderFragment) TextLength(docBytes []byte) int64 {
 func (p PlaceholderFragment) String(docBytes []byte) string {
  format := "fragment %d in %s with fragment text-positions: [%d:%d] '%s'"
  return fmt.Sprintf(format, p.ID, p.Run.String(docBytes),
- p.Position.Start, p.Position.End, docBytes[p.Run.Text.StartTag.End+p.Position.Start:p.Run.Text.StartTag.End+p.Position.End])
+ p.Position.Start, p.Position.End, docBytes[p.Run.Text.OpenTag.End+p.Position.Start:p.Run.Text.OpenTag.End+p.Position.End])
 }
 
 // NewFragmentID returns the next Fragment.ID