Skip to content

Commit

Permalink
save a lot of memory by using pointers
Browse files Browse the repository at this point in the history
  • Loading branch information
XORbit committed Feb 23, 2024
1 parent e9de4e9 commit 30f856f
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 23 deletions.
40 changes: 22 additions & 18 deletions core/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package core
import (
"crypto/tls"
"fmt"
"github.com/briandowns/spinner"
"github.com/fatih/color"
"io"
"net/http"
"os"
Expand All @@ -15,6 +13,9 @@ import (
"syscall"
"time"

"github.com/briandowns/spinner"
"github.com/fatih/color"

"github.com/Malwarize/webpalm/v2/shared"
"github.com/Malwarize/webpalm/v2/webtree"
)
Expand Down Expand Up @@ -163,7 +164,9 @@ func (c *Crawler) ExtractLinks(page *webtree.Page) (links []string) {
continue
}
// check if it is a relative url
if strings.HasPrefix(match[1], "/") || strings.HasPrefix(match[1], "./") || strings.HasPrefix(match[1], "../") || strings.HasSuffix(match[1], "/") {
if strings.HasPrefix(match[1], "/") || strings.HasPrefix(match[1], "./") ||
strings.HasPrefix(match[1], "../") ||
strings.HasSuffix(match[1], "/") {
u, err := page.ConvertToAbsoluteURL(match[1])
if err != nil {
continue
Expand All @@ -174,7 +177,7 @@ func (c *Crawler) ExtractLinks(page *webtree.Page) (links []string) {
return
}

func (c *Crawler) ExportJSON(root webtree.Node, filename string) error {
func (c *Crawler) ExportJSON(root *webtree.Node, filename string) error {
data, err := root.SprintJSON()
if err != nil {
return err
Expand All @@ -186,7 +189,7 @@ func (c *Crawler) ExportJSON(root webtree.Node, filename string) error {
return nil
}

func (c *Crawler) ExportTXT(root webtree.Node, filename string) error {
func (c *Crawler) ExportTXT(root *webtree.Node, filename string) error {
data, err := root.SprintTXT()
if err != nil {
return err
Expand All @@ -198,7 +201,7 @@ func (c *Crawler) ExportTXT(root webtree.Node, filename string) error {
return nil
}

func (c *Crawler) ExportXML(tree webtree.Node, filename string) error {
func (c *Crawler) ExportXML(tree *webtree.Node, filename string) error {
data, err := tree.SprintXML()
if err != nil {
return err
Expand All @@ -210,7 +213,7 @@ func (c *Crawler) ExportXML(tree webtree.Node, filename string) error {
return nil
}

func (c *Crawler) Export(tree webtree.Node, format string, filename string) error {
func (c *Crawler) Export(tree *webtree.Node, format string, filename string) error {
if format == "json" {
err := c.ExportJSON(tree, filename)
if err != nil {
Expand Down Expand Up @@ -253,7 +256,7 @@ func (c *Crawler) isSkipableUrl(u string) bool {
return true
}

func (c *Crawler) IsSkipablePage(page webtree.Page) bool {
func (c *Crawler) IsSkipablePage(page *webtree.Page) bool {
isInCode := func(status int, arr []int) bool {
for _, v := range arr {
if v == status {
Expand All @@ -271,7 +274,7 @@ func (c *Crawler) IsSkipablePage(page webtree.Page) bool {
return false
}

func (c *Crawler) AddMatches(page webtree.Page) {
func (c *Crawler) AddMatches(page *webtree.Page) {
for rname, regex := range c.RegexMap {
r := regexp.MustCompile(regex)
matches := r.FindAllString(page.GetData(), -1)
Expand All @@ -282,7 +285,7 @@ func (c *Crawler) AddMatches(page webtree.Page) {
}

func (c *Crawler) ProcessANode(node *webtree.Node) {
c.Fetch(&node.Page)
c.Fetch(node.Page)
c.AddMatches(node.Page)
if c.IsSkipablePage(node.Page) {
return
Expand All @@ -291,7 +294,7 @@ func (c *Crawler) ProcessANode(node *webtree.Node) {
if c.Level < 1 {
return
}
links := c.ExtractLinks(&node.Page)
links := c.ExtractLinks(node.Page)
for _, link := range links {
if c.isSkipableUrl(link) {
continue
Expand Down Expand Up @@ -325,7 +328,7 @@ func (c *Crawler) CrawlNodeBlock(w *webtree.Node, levelChangedChan chan int) {
close(tasks)
wg.Wait()

//signal to spinner that level has changed
// signal to spinner that level has changed
c.Level--
levelChangedChan <- level - c.Level
}
Expand All @@ -337,7 +340,7 @@ func (c *Crawler) CrawlNodeLive(w *webtree.Node) {
if level < 0 {
return
}
c.Fetch(&w.Page)
c.Fetch(w.Page)

if c.Delay > 0 {
time.Sleep(time.Duration(c.Delay) * time.Millisecond)
Expand All @@ -358,7 +361,7 @@ func (c *Crawler) CrawlNodeLive(w *webtree.Node) {
// add visited node to cache
c.Cache.AddVisited(w.Page.GetUrl())

links := c.ExtractLinks(&w.Page)
links := c.ExtractLinks(w.Page)

// add children
for i, link := range links {
Expand All @@ -373,7 +376,7 @@ func (c *Crawler) CrawlNodeLive(w *webtree.Node) {
f(w, c.Level, "", true)
}

func (c *Crawler) SaveResults(root webtree.Node) {
func (c *Crawler) SaveResults(root *webtree.Node) {
if strings.HasSuffix(c.ExportFile, ".txt") {
err := c.Export(root, "txt", c.ExportFile)
if err != nil {
Expand All @@ -393,7 +396,8 @@ func (c *Crawler) SaveResults(root webtree.Node) {
}

func (c *Crawler) Crawl() {
root := webtree.Node{}
// root := *webtree.Node{}
root := webtree.NewNode(webtree.NewPage(), nil, make([]*webtree.Node, 0))
root.Page.SetUrl(c.RootURL)
interruptChan := make(chan os.Signal, 1)
signal.Notify(interruptChan, syscall.SIGINT, syscall.SIGTERM)
Expand All @@ -413,7 +417,7 @@ func (c *Crawler) Crawl() {

// live mode or block mode
if c.Workers == 0 {
c.CrawlNodeLive(&root)
c.CrawlNodeLive(root)
} else {
color.Yellow("NOTE: This program is running in parallel mode, so you won't be able to see the output directly until the tree is entirely built. You can observe the output in your saved file, which is updated at each level traversal.")
LevelChangedChan := make(chan int, 1)
Expand All @@ -438,7 +442,7 @@ func (c *Crawler) Crawl() {
}
}()

c.CrawlNodeBlock(&root, LevelChangedChan)
c.CrawlNodeBlock(root, LevelChangedChan)
root.Display()
}
fmt.Println("\033[?25h")
Expand Down
1 change: 0 additions & 1 deletion shared/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ func ValidateThenBuildOption(cmd *cobra.Command) (*Options, error) {
} else {
parsedProxy = nil
}

timeout, err := cmd.Flags().GetInt("timeout")
if err != nil {
return nil, err
Expand Down
27 changes: 23 additions & 4 deletions webtree/webtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,35 @@ import (
)

// Node is one page in the crawl tree. Page and mutex are held by pointer
// so that Node headers stay small when passed around (the commit intent is
// memory savings — TODO confirm with a benchmark).
type Node struct {
	Page     *Page   // the fetched page this node represents
	Parent   *Node   // parent in the crawl tree; nil for the root
	Children []*Node // child nodes discovered from this page's links
	// mutex guards Children (see AddChild/GetChildren). It must be non-nil,
	// which NewNode guarantees.
	// NOTE(review): a zero-value Node{} has a nil mutex and will panic on
	// AddChild — construct nodes only through NewNode.
	mutex *sync.Mutex
}

// NewNode builds a crawl-tree node wired to the given page, parent, and
// child slice, with its mutex initialized so the node is immediately safe
// for concurrent Children mutation via AddChild.
func NewNode(
	page *Page,
	parent *Node,
	children []*Node,
) *Node {
	n := Node{
		Page:     page,
		Parent:   parent,
		Children: children,
	}
	n.mutex = new(sync.Mutex)
	return &n
}

// AddChild creates a child node for page, appends it to node.Children
// under the node's mutex (so concurrent crawler workers can add children
// safely), and returns the new child.
func (node *Node) AddChild(page *Page) *Node {
	node.mutex.Lock()
	defer node.mutex.Unlock()
	// Removed leftover commented-out pre-refactor code; NewNode is the
	// single construction path and initializes the child's mutex.
	child := NewNode(page, node, make([]*Node, 0))
	node.Children = append(node.Children, child)
	return child
}

func (node *Node) GetChildren() []*Node {
node.mutex.Lock()
defer node.mutex.Unlock()
Expand Down Expand Up @@ -83,7 +99,10 @@ func (node *Node) ToXMLPage() *XmlPage {
exportNode.StatusCode = node.Page.GetStatusCode()
for name, results := range node.Page.GetResults() {
for _, result := range results {
exportNode.Results = append(exportNode.Results, &XmlPageResult{Pattern: name, Result: []string{result}})
exportNode.Results = append(
exportNode.Results,
&XmlPageResult{Pattern: name, Result: []string{result}},
)
}
}
exportNode.Children = make([]*XmlPage, 0)
Expand Down

0 comments on commit 30f856f

Please sign in to comment.