new_scrape_algorithm_#5 (#20)

First implementation of the new scraping algorithm that was proposed in issue #5.

Reviewed-on: anthrove/e621-to-graph#20
Reviewed-by: Lennard Brinkhaus <lennard.brinkhaus@noreply.localhost>
Reviewed-by: daskadse <daskadse@noreply.localhost>
Co-authored-by: SoXX <soxx@fenpa.ws>
Co-committed-by: SoXX <soxx@fenpa.ws>
This commit is contained in:
SoXX 2023-11-15 20:28:44 +00:00 committed by Lennard Brinkhaus
parent 60b3502ee3
commit 3be16a9277
8 changed files with 163 additions and 103 deletions

View File

@ -20,6 +20,10 @@ func NewNeo4JConnection(neo4jDebug bool) logic.GraphConnection {
}
}
// GetUserFavoriteCount forwards to GetUserFavoritesCount using the driver held
// by this connection and hands its result back unchanged.
func (c *neo4jConnection) GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error) {
	favoriteCount, err := GetUserFavoritesCount(ctx, c.driver, userID)
	return favoriteCount, err
}
// CheckUserToPostLink forwards to the package-level CheckUserToPostLink using
// the driver held by this connection and hands its result back unchanged.
func (c *neo4jConnection) CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error) {
	linked, err := CheckUserToPostLink(ctx, c.driver, e621PostID, e621UserID)
	return linked, err
}

View File

@ -8,9 +8,8 @@ import (
func CreatePostNode(ctx context.Context, driver neo4j.DriverWithContext, postID model.PostID) error {
query := `
MERGE (u:e621Post {e621PostID: $postID})
RETURN u
`
MERGE (u:e621Post {e621PostID: $postID});
`
params := map[string]any{
"postID": postID,
}

View File

@ -11,7 +11,7 @@ func EstablishPostTagLink(ctx context.Context, driver neo4j.DriverWithContext, e
query := `
MATCH (p:e621Post {e621PostID: $e621PostID})
MATCH (t:e621Tag {e621Tag: $e621Tag})
MERGE (p)-[:HAS_TAG]->(t)
MERGE (p)-[:HAS_TAG]->(t);
`
params := map[string]interface{}{
"e621PostID": e621PostID,

View File

@ -7,9 +7,8 @@ import (
func CreateSourceNode(ctx context.Context, driver neo4j.DriverWithContext, URL string) error {
query := `
MERGE (u:Source {URL: $url})
RETURN u
`
MERGE (u:Source {URL: $url});
`
params := map[string]any{
"url": URL,
}

View File

@ -8,9 +8,8 @@ import (
func CreateTagNode(ctx context.Context, driver neo4j.DriverWithContext, name string, tagType string) error {
query := `
MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType})
RETURN u
`
MERGE (u:e621Tag {e621Tag: $name, e621TagType: $tagType});
`
params := map[string]interface{}{
"name": name,
"tagType": tagType,
@ -30,7 +29,7 @@ func GetTagNodeByName(ctx context.Context, driver neo4j.DriverWithContext, name
query := `
MATCH (u:e621Tag {e621Tag: $name})
RETURN u.e621Tag as e621Tag, u.e621TagType as e621TagType
RETURN u.e621Tag AS e621Tag, u.e621TagType AS e621TagType;
`
params := map[string]interface{}{

View File

@ -8,8 +8,7 @@ import (
func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user model.User) error {
query := `
MERGE (u:e621User {e621ID: $id, e621Username: $name})
RETURN u
MERGE (u:e621User {e621ID: $id, e621Username: $name});
`
params := map[string]interface{}{
"id": user.ID,
@ -22,3 +21,34 @@ func CreateUserNode(ctx context.Context, driver neo4j.DriverWithContext, user mo
}
return nil
}
// GetUserFavoritesCount runs a Cypher query that counts the IS_FAVORITE
// relationships leading from the e621User node with the given e621ID to
// e621Post nodes, and returns the value of the numberOfFavoritedPosts column.
//
// A failing query yields (0, err). An empty result set yields (0, nil). If the
// column cannot be read as int64, the value and error produced by
// neo4j.GetRecordValue are passed through as-is.
func GetUserFavoritesCount(ctx context.Context, driver neo4j.DriverWithContext, userID model.UserID) (int64, error) {
	const cypher = `
MATCH (:e621User {e621ID: $userID})-[:IS_FAVORITE]->(:e621Post)
RETURN count(*) AS numberOfFavoritedPosts;
`

	res, err := neo4j.ExecuteQuery(ctx, driver, cypher, map[string]any{"userID": userID}, neo4j.EagerResultTransformer)
	if err != nil {
		return 0, err
	}

	// Defensive guard: without any record there is nothing to read, so the
	// count is reported as zero.
	// NOTE(review): an ungrouped count(*) presumably always yields one row,
	// so this branch may never trigger - confirm against the Cypher docs.
	if len(res.Records) < 1 {
		return 0, nil
	}

	// The second return value of GetRecordValue is intentionally discarded,
	// exactly as before.
	count, _, err := neo4j.GetRecordValue[int64](res.Records[0], "numberOfFavoritedPosts")
	return count, err
}

View File

@ -13,9 +13,10 @@ import (
func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, client *e621.Client, username string) error {
var err error
scrapeTime := time.Now()
e621User, err := client.GetUserByName(username).Execute()
if err != nil {
log.Info(err)
return err
}
@ -28,107 +29,134 @@ func ScrapeUser(ctx context.Context, graphConnection logic.GraphConnection, clie
return nil
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
}).Info("service: processing user")
err = graphConnection.UploadUser(ctx, e621User)
if err != nil {
log.Fatal(err)
return err
}
currentDBFavCount, err := graphConnection.GetUserFavoriteCount(ctx, e621User.ID)
if err != nil {
return err
}
favoriteBuilder, err := client.GetFavoritesForUser(e621User.Name)
if err != nil {
return err
}
if currentDBFavCount > e621User.FavoriteCount {
//TODO: IMPLEMENT USER MARKED FOR DELETED FAVS
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
}).Info("service: start processing favorites")
start := time.Now()
"e621_current_db_favorite_count": currentDBFavCount,
"e621_user_favorite_count": e621User.FavoriteCount,
}).Debug("service: user has favorites deleted")
}
e621FavoritesBuilder := client.GetFavoritesBuilder().SetUserID(e621User.ID)
e621Favorites, err := client.GetAllFavoritesForUser(e621FavoritesBuilder)
var pageIndex = 1
for currentDBFavCount < e621User.FavoriteCount {
// Uploads all Tags, Posts as Nodes to Neo4j
for i, post := range e621Favorites {
if exists, err := graphConnection.CheckUserToPostLink(ctx, post.ID, e621User.ID); err == nil && exists {
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"last_post_id": post.ID,
}).Info("service: no new favorites found")
favorites, err := favoriteBuilder.Page(pageIndex).Execute()
if err != nil {
return err
}
if len(favorites) <= 0 {
return nil
}
for _, favorite := range favorites {
if currentDBFavCount == e621User.FavoriteCount {
break
} else if err != nil {
return err
}
start = time.Now()
err = uploadNodes(ctx, graphConnection, post)
isFaved, err := graphConnection.CheckUserToPostLink(ctx, favorite.ID, e621User.ID)
if err != nil {
return err
}
if !isFaved {
err = uploadDataToDB(ctx, graphConnection, favorite, e621User)
if err != nil {
return err
}
currentDBFavCount++
}
}
pageIndex++
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"post_number": i,
"post_amount": len(e621Favorites),
"post_id": post.ID,
"upload_time": time.Since(start),
}).Debug("service: uploading post")
start := time.Now()
err = uploadPostToUserRelationship(ctx, graphConnection, post, e621User)
if err != nil {
log.Fatal(err)
return err
}
err = uploadSourceTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadGeneralTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadCharacterTagtRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadCopyrightTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
err = uploadArtistTagRelationship(ctx, graphConnection, post)
if err != nil {
log.Fatal(err)
return err
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"post_number": i,
"post_amount": len(e621Favorites),
"post_id": post.ID,
"upload_time": time.Since(start),
}).Debug("service: making relationship")
}
log.WithFields(log.Fields{
"e621_username": e621User.Name,
"e621_user_id": e621User.ID,
"post_amount": len(e621Favorites),
"scrape_time": time.Since(start),
"post_amount": e621User.FavoriteCount,
"scrape_time": time.Since(scrapeTime),
}).Info("service: finished processing favorites")
return nil
}
// uploadDataToDB writes one favorite post to the graph in two timed phases.
//
// Phase one calls uploadNodes for the post. Phase two links the post to the
// user and then uploads the source, general, character, copyright and artist
// tag relationships, in that order. Each phase emits a debug log entry with the
// elapsed time once it has completed.
//
// The first failing step aborts the upload and its error is returned to the
// caller. Errors are returned rather than passed to log.Fatal: a fatal log call
// would end the process from inside this helper and make the following return
// unreachable, whereas returning lets the caller decide how to react.
func uploadDataToDB(ctx context.Context, graphConnection logic.GraphConnection, favorite model.Post, e621User model.User) error {
	// Phase 1: create the nodes belonging to the post.
	start := time.Now()
	err := uploadNodes(ctx, graphConnection, favorite)
	if err != nil {
		return err
	}

	log.WithFields(log.Fields{
		"e621_username": e621User.Name,
		"e621_user_id":  e621User.ID,
		"post_id":       favorite.ID,
		"upload_time":   time.Since(start),
	}).Debug("service: uploaded post")

	// Phase 2: create the relationships; the timer is restarted so the second
	// log entry only covers the relationship uploads.
	start = time.Now()
	if err = uploadPostToUserRelationship(ctx, graphConnection, favorite, e621User); err != nil {
		return err
	}
	if err = uploadSourceTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}
	if err = uploadGeneralTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}
	if err = uploadCharacterTagtRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}
	if err = uploadCopyrightTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}
	if err = uploadArtistTagRelationship(ctx, graphConnection, favorite); err != nil {
		return err
	}

	log.WithFields(log.Fields{
		"e621_username": e621User.Name,
		"e621_user_id":  e621User.ID,
		"post_id":       favorite.ID,
		"upload_time":   time.Since(start),
	}).Debug("service: made relationship")

	return nil
}
// uploadNodes uploads the post to the database and creates the nodes
func uploadNodes(ctx context.Context, graphConnection logic.GraphConnection, post model.Post) error {

View File

@ -15,4 +15,5 @@ type GraphConnection interface {
EstablishPostToSourceLink(ctx context.Context, e621PostID model.PostID, sourceURL string) error
EstablishUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) error
CheckUserToPostLink(ctx context.Context, e621PostID model.PostID, e621UserID model.UserID) (bool, error)
GetUserFavoriteCount(ctx context.Context, userID model.UserID) (int64, error)
}