Skip to content

Commit

Permalink
Merge pull request getmaxun#202 from getmaxun/develop
Browse files Browse the repository at this point in the history
feat: handle relative src & href paths in scrapeList & scrapeSchema (missing in v0.0.2)
  • Loading branch information
amhsirak authored Nov 23, 2024
2 parents 9c6deaa + 600c36d commit 36ebff4
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ services:
#build:
#context: .
#dockerfile: server/Dockerfile
image: getmaxun/maxun-backend:v0.0.1
image: getmaxun/maxun-backend:v0.0.2
ports:
- "8080:8080"
env_file: .env
Expand Down
9 changes: 7 additions & 2 deletions maxun-core/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "maxun-core",
"version": "0.0.3",
"version": "0.0.4",
"description": "Core package for Maxun, responsible for data extraction",
"main": "build/index.js",
"typings": "build/index.d.ts",
Expand All @@ -20,7 +20,12 @@
"automation",
"workflow",
"data extraction",
"scraping"
"scraping",
"web scraper",
"web scraping",
"data scraping",
"no-code web scraper",
"no-code web scraping"
],
"author": "Maxun",
"license": "AGPL-3.0-or-later",
Expand Down
14 changes: 10 additions & 4 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,

switch (attribute) {
case 'href':
return elem.getAttribute('href');
const relativeHref = elem.getAttribute('href');
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
case 'src':
return elem.getAttribute('src');
const relativeSrc = elem.getAttribute('src');
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
case 'innerText':
return elem.innerText;
case 'textContent':
Expand Down Expand Up @@ -281,9 +283,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
record[label] = fieldElement.src;
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, baseUrl).href : null;
} else if (attribute === 'href') {
record[label] = fieldElement.href;
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, baseUrl).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
}
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"jwt-decode": "^4.0.0",
"loglevel": "^1.8.0",
"loglevel-plugin-remote": "^0.6.8",
"maxun-core": "^0.0.3",
"maxun-core": "0.0.4",
"minio": "^8.0.1",
"moment-timezone": "^0.5.45",
"node-cron": "^3.0.3",
Expand Down Expand Up @@ -110,4 +110,4 @@
"ts-node": "^10.4.0",
"vite": "^5.4.10"
}
}
}
2 changes: 1 addition & 1 deletion server/src/workflow-management/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => {
selectors?.text?.length != null &&
selectors?.text?.length < 25 &&
action.hasOnlyText
? `text=${selectors.text}`
? selectors.generalSelector
: null;

if (action.tagName === TagName.Input) {
Expand Down

0 comments on commit 36ebff4

Please sign in to comment.