Skip to content

Commit

Permalink
cleanup and refactoring; renamed TextRun to TagPair
Browse files Browse the repository at this point in the history
  • Loading branch information
lukasjarosch committed Feb 2, 2021
1 parent 0566f78 commit fddd06c
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 131 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
test:
@go test -v .

gofmt:
@gofmt -w *.go

5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,11 @@ To not cause too much confusion, here is a list of terms which you might come ac

* **Parser**: Every file which this lib handles (document, footers and headers) has its own parser attached, since everything is relative to the underlying byte-slice (aka. file).
* **Position**: A Position is just a `Start` and `End` offset, relative to the byte slice of the document of a parser.
* **Run**: Describes the pair `<w:r>` and `</w:r>` and thus has two `Positions` for the open and close tag. Since they are Positions, they have a `Start` and `End` Position which point to `<` and `>` of the tag. A run also consists of a `TextRun`.
* **TextRun**: Is always nested inside a run and describes the tags `<w:t>` and `</w:t>`. It also just consists of two `Positions`. The type was just created for clarity and does not have special functionality.
* **Run**: Describes the pair `<w:r>` and `</w:r>` and thus has two `Positions` for the open and close tag. Since they are Positions, they have a `Start` and `End` Position which point to `<` and `>` of the tag. A run also consists of a `TagPair`.

* **Placeholder**: A Placeholder is basically just a list of `PlaceholderFragments` representing a full placeholder extracted by a `Parser`.
* **PlaceholderFragment**: A PlaceholderFragment is a parsed fragment of a placeholder since those will most likely be ripped apart by WordprocessingML. The Placeholder `{foo-bar-baz}` might ultimately consist of 5 fragments ( `{`, `foo-`, `bar-`, `baz`, `}`).
The fragment is at the heart of replacing. It knows to which `Run` it belongs to and has methods of manipulating these byte-offsets. Additionally it has a `Position` which describes the offset inside the `TextRun` since the fragments don't always start at the beginning of one (e.g. `<w:t>some text {fragment-start</w:t>`)
The fragment is at the heart of replacing. It knows which `Run` it belongs to and has methods for manipulating these byte-offsets. Additionally, it has a `Position` which describes the offset inside the `TagPair`, since the fragments don't always start at the beginning of one (e.g. `<w:t>some text {fragment-start</w:t>`)

### ➤ How it works
This section will give you a short overview of what's actually going on.
Expand Down
6 changes: 1 addition & 5 deletions document.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ func OpenBytes(b []byte) (*Document, error) {
// newDocument will create a new document struct given the zipFile.
// The params 'path' and 'docxFile' may be empty/nil in case the document is created from a byte source directly.
//
// newDocument will parse the docx archive and ValidateRuns that at least a 'document.xml' exists.
// newDocument will parse the docx archive and validate that at least a 'document.xml' exists.
// If 'word/document.xml' is missing, an error is returned since the docx cannot be correct.
// Then all files are parsed for their runs before returning the new document.
func newDocument(zipFile *zip.Reader, path string, docxFile *os.File) (*Document, error) {
Expand Down Expand Up @@ -416,10 +416,6 @@ func (fm FileMap) Write(writer io.Writer, filename string) error {
return fmt.Errorf("file not found %s", filename)
}

// cleanup the file in order to solve known compatibility issues
// MS Word will not open the document if a file contains a singleton text <w:t/>, thus they need to be removed
//file = []byte(strings.Replace(string(file), "<w:t/>", "", -1))

_, err := writer.Write(file)
if err != nil && err != io.EOF {
return fmt.Errorf("unable to writeFile '%s': %s", filename, err)
Expand Down
123 changes: 59 additions & 64 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ var (
RunCloseTagRegex = regexp.MustCompile(`(</w:r>)`)
// RunSingletonTagRegex matches a singleton run tag
RunSingletonTagRegex = regexp.MustCompile(`(<w:r/>)`)
// TextRunOpenTagRegex matches all OpenTags for text-runs, including eventually set attributes
TextRunOpenTagRegex = regexp.MustCompile(`(<w:t).*>`)
// TextRunCloseTagRegex matches the close tag of text-runs
TextRunCloseTagRegex = regexp.MustCompile(`(</w:t>)`)
// ErrParsingFailed is returned if the parsing failed and the result cannot be used.
// TextOpenTagRegex matches all open tags of text-runs, including any attributes that may be set
TextOpenTagRegex = regexp.MustCompile(`(<w:t).*>`)
// TextCloseTagRegex matches the close tag of text-runs
TextCloseTagRegex = regexp.MustCompile(`(</w:t>)`)
// ErrTagsInvalid is returned if the parsing failed and the result cannot be used.
// Typically this means that one or more tag-offsets were not parsed correctly which
// would cause the document to become corrupted as soon as replacing starts.
ErrParsingFailed = errors.New("failed to parse the document, cannot continue")
ErrTagsInvalid = errors.New("one or more tags are invalid and will cause the XML to be corrupt")
)

// RunParser can parse a list of Runs from a given byte slice.
Expand Down Expand Up @@ -63,53 +63,14 @@ func (parser *RunParser) Execute() error {
return err
}

return ValidateRuns(parser.doc, parser.runs)
return ValidatePositions(parser.doc, parser.runs)
}

// Runs returns all runs found by the parser.
// The slice is handed out exactly as stored on the parser; no copy is made.
func (parser *RunParser) Runs() DocumentRuns {
return parser.runs
}

// ValidateRuns checks that the tag offsets of every run, and of its text tags
// if the run has text, point at bytes which match the corresponding tag regex.
//
// Every mismatch is logged. If at least one mismatch was found,
// ErrParsingFailed is returned, because replacing with wrong offsets would not work.
func ValidateRuns(document []byte, runs []*Run) error {
	// tagAt returns the document text covered by the given tag position
	tagAt := func(pos Position) string {
		return string(document[pos.Start:pos.End])
	}

	valid := true
	for _, r := range runs {
		// singleton run tags are skipped entirely, they are not validated any further
		if RunSingletonTagRegex.MatchString(tagAt(r.OpenTag)) {
			continue
		}

		if !RunOpenTagRegex.MatchString(tagAt(r.OpenTag)) {
			log.Println("RunOpenTagRegex failed to match", r.String(document))
			valid = false
		}
		if !RunCloseTagRegex.MatchString(tagAt(r.CloseTag)) {
			log.Println("RunCloseTagRegex failed to match", r.String(document))
			valid = false
		}

		// the text tags are only checked for runs which actually have text
		if !r.HasText {
			continue
		}
		if !TextRunOpenTagRegex.MatchString(tagAt(r.Text.StartTag)) {
			log.Println("TextRunOpenTagRegex failed to match", r.String(document))
			valid = false
		}
		if !TextRunCloseTagRegex.MatchString(tagAt(r.Text.EndTag)) {
			log.Println("TextRunCloseTagRegex failed to match", r.String(document))
			valid = false
		}
	}

	if valid {
		return nil
	}
	return ErrParsingFailed
}

// findRuns will search through the document and collect all runs found.
// The text tags are not analyzed at this point, that's the next step.
func (parser *RunParser) findRuns() error {
Expand All @@ -120,8 +81,8 @@ func (parser *RunParser) findRuns() error {
tmpRun := NewEmptyRun()
singleton := false

// nestCount holds the nesting-level. It is going to be incremented on every StartTag and decremented
// on every EndTag.
// nestCount holds the nesting-level. It is going to be incremented on every OpenTag and decremented
// on every CloseTag.
nestCount := 0

// popRun will pop the last Run from the runStack if there is any on the stack
Expand All @@ -132,7 +93,7 @@ func (parser *RunParser) findRuns() error {
}

// nextIteration resets the temporary values used inside the for-loop to be ready for the next iteration
// This is used after a run has been fully analyzed (StartTag and EndTag were found).
// This is used after a run has been fully analyzed (OpenTag and CloseTag were found).
// As long as there are runs on the runStack, they will be popped from it.
// Only when the stack is empty, a new empty Run struct is created.
nextIteration := func() {
Expand Down Expand Up @@ -186,7 +147,7 @@ func (parser *RunParser) findRuns() error {
if elem.Name.Local == RunElementName {

// if the run is a singleton tag, it was already identified by the xml.StartElement case
// in that case, the EndTag is the same as the openTag and no further work needs to be done
// in that case, the CloseTag is the same as the openTag and no further work needs to be done
if singleton {
tmpRun.CloseTag = tmpRun.OpenTag
parser.runs = append(parser.runs, tmpRun) // run is finished
Expand All @@ -213,7 +174,7 @@ func (parser *RunParser) findRuns() error {

if nestCount != 0 {
log.Printf("invalid nestCount, should be 0 but is %d\n", nestCount)
return ErrParsingFailed
return ErrTagsInvalid
}

return nil
Expand Down Expand Up @@ -257,7 +218,7 @@ func (parser *RunParser) findTextRuns() error {
return fmt.Errorf("unable to find currentRun for text start-element")
}
currentRun.HasText = true
currentRun.Text.StartTag = Position{
currentRun.Text.OpenTag = Position{
Start: tagStartPos,
End: tagEndPos,
}
Expand All @@ -275,7 +236,7 @@ func (parser *RunParser) findTextRuns() error {
if currentRun == nil {
return fmt.Errorf("unable to find currentRun for text end-element")
}
currentRun.Text.EndTag = Position{
currentRun.Text.CloseTag = Position{
Start: tagStartPos,
End: tagEndPos,
}
Expand All @@ -297,22 +258,56 @@ func (parser *RunParser) findOpenBracketPos(endBracketPos int64) int64 {
return 0
}

// TagPosition builds the Position of a tag from the offset at which the tag ends.
// The start offset is derived by subtracting the byte length of the tag from endPos.
func TagPosition(endPos int64, tag string) (tp Position) {
	return Position{
		Start: endPos - int64(len(tag)),
		End:   endPos,
	}
}
// ValidatePositions will iterate over all runs and their texts (if any) and ensure that they match
// their respective regex.
// If the validation failed, the replacement will not work since offsets are wrong.
func ValidatePositions(document []byte, runs []*Run) error {
parsingFailed := false
for _, run := range runs {

// singleton tags must not be validated
if run.OpenTag.Match(RunSingletonTagRegex, document) {
continue
}

if !run.OpenTag.Match(RunOpenTagRegex, document) {
log.Println("RunOpenTagRegex failed to match", run.String(document))
parsingFailed = true
}
if !run.CloseTag.Match(RunCloseTagRegex, document) {
log.Println("RunCloseTagRegex failed to match", run.String(document))
parsingFailed = true
}

// TextRun defines the <w:t> element which contains the actual literal text data.
// A TextRun is always a child of a Run.
type TextRun struct {
StartTag Position
EndTag Position
if run.HasText {
if !run.Text.OpenTag.Match(TextOpenTagRegex, document) {
log.Println("TextOpenTagRegex failed to match", run.String(document))
parsingFailed = true
}
if !run.Text.CloseTag.Match(TextCloseTagRegex, document) {
log.Println("TextCloseTagRegex failed to match", run.String(document))
parsingFailed = true
}
}
}
if parsingFailed {
return ErrTagsInvalid
}

return nil
}

// Position is a generic position of a tag, represented by byte offsets
// into the data the tag was found in.
type Position struct {
	Start int64 // offset of the first byte covered by the position
	End   int64 // used as the exclusive upper bound when slicing the data
}

// Match reports whether the bytes of data covered by the position
// (data[p.Start:p.End]) match the given regex.
//
// A position which does not describe a valid range inside data (a negative
// offset, Start beyond End, or End beyond len(data)) cannot match anything.
// In that case false is returned instead of panicking on the slice expression,
// so callers validating offsets get a regular "no match" for broken positions.
func (p Position) Match(re *regexp.Regexp, data []byte) bool {
	if p.Start < 0 || p.End < p.Start || p.End > int64(len(data)) {
		return false
	}

	// Match works on the byte slice directly, which avoids copying it into a string
	return re.Match(data[p.Start:p.End])
}
6 changes: 3 additions & 3 deletions placeholder.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type Placeholder struct {
func (p Placeholder) Text(docBytes []byte) string {
str := ""
for _, fragment := range p.Fragments {
s := fragment.Run.Text.StartTag.End
s := fragment.Run.Text.OpenTag.End
t := docBytes[s+fragment.Position.Start : s+fragment.Position.End]
str += string(t)
}
Expand All @@ -43,13 +43,13 @@ func (p Placeholder) Text(docBytes []byte) string {

// StartPos returns the absolute start position of the placeholder.
func (p Placeholder) StartPos() int64 {
return p.Fragments[0].Run.Text.StartTag.End + p.Fragments[0].Position.Start
return p.Fragments[0].Run.Text.OpenTag.End + p.Fragments[0].Position.Start
}

// EndPos returns the absolute end position of the placeholder.
func (p Placeholder) EndPos() int64 {
end := len(p.Fragments) - 1
return p.Fragments[end].Run.Text.StartTag.End + p.Fragments[end].Position.End
return p.Fragments[end].Run.Text.OpenTag.End + p.Fragments[end].Position.End
}

// ParsePlaceholders will, given the document run positions and the bytes, parse out all placeholders including
Expand Down
22 changes: 11 additions & 11 deletions placeholder_fragment.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ func (p *PlaceholderFragment) ShiftAll(deltaLength int64) {
p.Run.OpenTag.End += deltaLength
p.Run.CloseTag.Start += deltaLength
p.Run.CloseTag.End += deltaLength
p.Run.Text.StartTag.Start += deltaLength
p.Run.Text.StartTag.End += deltaLength
p.Run.Text.EndTag.Start += deltaLength
p.Run.Text.EndTag.End += deltaLength
p.Run.Text.OpenTag.Start += deltaLength
p.Run.Text.OpenTag.End += deltaLength
p.Run.Text.CloseTag.Start += deltaLength
p.Run.Text.CloseTag.End += deltaLength
}

// ShiftCut will shift the fragment position markers in such a way that the fragment can be considered empty.
Expand All @@ -52,8 +52,8 @@ func (p *PlaceholderFragment) ShiftAll(deltaLength int64) {
// If that data was removed from the document, the positions (not all positions) of the fragment need to be adjusted.
// The text positions are set equal (start == end).
func (p *PlaceholderFragment) ShiftCut(cutLength int64) {
p.Run.Text.EndTag.Start -= cutLength
p.Run.Text.EndTag.End -= cutLength
p.Run.Text.CloseTag.Start -= cutLength
p.Run.Text.CloseTag.End -= cutLength
p.Run.CloseTag.Start -= cutLength
p.Run.CloseTag.End -= cutLength
p.Position.End = p.Position.Start
Expand All @@ -64,21 +64,21 @@ func (p *PlaceholderFragment) ShiftCut(cutLength int64) {
// For example, the fragment text was 'placeholder' (11 bytes) which is replaced with 'a-super-awesome-value' (21 bytes)
// In that case the deltaLength would be 10. In order to accommodate for the change in bytes you'd need to call ShiftReplace(10)
func (p *PlaceholderFragment) ShiftReplace(deltaLength int64) {
p.Run.Text.EndTag.Start += deltaLength
p.Run.Text.EndTag.End += deltaLength
p.Run.Text.CloseTag.Start += deltaLength
p.Run.Text.CloseTag.End += deltaLength
p.Run.CloseTag.Start += deltaLength
p.Run.CloseTag.End += deltaLength
p.Position.End += deltaLength
}

// StartPos returns the absolute start position of the fragment.
func (p PlaceholderFragment) StartPos() int64 {
return p.Run.Text.StartTag.End + p.Position.Start
return p.Run.Text.OpenTag.End + p.Position.Start
}

// EndPos returns the absolute end position of the fragment.
func (p PlaceholderFragment) EndPos() int64 {
return p.Run.Text.StartTag.End + p.Position.End
return p.Run.Text.OpenTag.End + p.Position.End
}

// Text returns the actual text of the fragment given the source bytes.
Expand All @@ -100,7 +100,7 @@ func (p PlaceholderFragment) TextLength(docBytes []byte) int64 {
func (p PlaceholderFragment) String(docBytes []byte) string {
format := "fragment %d in %s with fragment text-positions: [%d:%d] '%s'"
return fmt.Sprintf(format, p.ID, p.Run.String(docBytes),
p.Position.Start, p.Position.End, docBytes[p.Run.Text.StartTag.End+p.Position.Start:p.Run.Text.StartTag.End+p.Position.End])
p.Position.Start, p.Position.End, docBytes[p.Run.Text.OpenTag.End+p.Position.Start:p.Run.Text.OpenTag.End+p.Position.End])
}

// NewFragmentID returns the next Fragment.ID
Expand Down
Loading

0 comments on commit fddd06c

Please sign in to comment.